OmniSciDB  2e3a973ef4
Executor Class Reference

#include <Execute.h>


Classes

class  FetchCacheAnchor
 
struct  GroupColLLVMValue
 
struct  JoinHashTableOrError
 

Public Types

using ExecutorId = size_t
 
using CachedCardinality = std::pair< bool, size_t >
 

Public Member Functions

 Executor (const ExecutorId id, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
 
StringDictionaryProxy * getStringDictionaryProxy (const int dictId, const std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const
 
bool isCPUOnly () const
 
bool isArchMaxwell (const ExecutorDeviceType dt) const
 
bool containsLeftDeepOuterJoin () const
 
const ColumnDescriptor * getColumnDescriptor (const Analyzer::ColumnVar *) const
 
const ColumnDescriptor * getPhysicalColumnDescriptor (const Analyzer::ColumnVar *, int) const
 
const Catalog_Namespace::Catalog * getCatalog () const
 
void setCatalog (const Catalog_Namespace::Catalog *catalog)
 
const std::shared_ptr< RowSetMemoryOwner > getRowSetMemoryOwner () const
 
const TemporaryTables * getTemporaryTables () const
 
Fragmenter_Namespace::TableInfo getTableInfo (const int table_id) const
 
const TableGeneration & getTableGeneration (const int table_id) const
 
ExpressionRange getColRange (const PhysicalInput &) const
 
size_t getNumBytesForFetchedRow (const std::set< int > &table_ids_to_fetch) const
 
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo (const std::vector< Analyzer::Expr *> &target_exprs) const
 
void registerActiveModule (void *module, const int device_id) const
 
void unregisterActiveModule (void *module, const int device_id) const
 
void interrupt (const std::string &query_session="", const std::string &interrupt_session="")
 
void resetInterrupt ()
 
void enableRuntimeQueryInterrupt (const unsigned interrupt_freq) const
 
int8_t warpSize () const
 
unsigned gridSize () const
 
unsigned numBlocksPerMP () const
 
unsigned blockSize () const
 
size_t maxGpuSlabSize () const
 
ResultSetPtr executeWorkUnit (size_t &max_groups_buffer_entry_guess, const bool is_agg, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
void executeUpdate (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const UpdateLogForFragment::Callback &cb, const bool is_agg)
 
void setupCaching (const std::unordered_set< PhysicalInput > &phys_inputs, const std::unordered_set< int > &phys_table_ids)
 
void setColRangeCache (const AggregatedColRange &aggregated_col_range)
 
template<typename SESSION_MAP_LOCK >
void setCurrentQuerySession (const std::string &query_session, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
std::string & getCurrentQuerySession (SESSION_MAP_LOCK &read_lock)
 
template<typename SESSION_MAP_LOCK >
bool checkCurrentQuerySession (const std::string &candidate_query_session, SESSION_MAP_LOCK &read_lock)
 
template<typename SESSION_MAP_LOCK >
void invalidateRunningQuerySession (SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
bool addToQuerySessionList (const std::string &query_session, const std::string &query_str, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
bool removeFromQuerySessionList (const std::string &query_session, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
void setQuerySessionAsInterrupted (const std::string &query_session, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
bool checkIsQuerySessionInterrupted (const std::string &query_session, SESSION_MAP_LOCK &read_lock)
 
std::optional< QuerySessionStatus > getQuerySessionInfo (const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
mapd_shared_mutex & getSessionLock ()
 
void addToCardinalityCache (const std::string &cache_key, const size_t cache_value)
 
CachedCardinality getCachedCardinality (const std::string &cache_key)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 
template<>
void setCurrentQuerySession (const std::string &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
std::string & getCurrentQuerySession (mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
template<>
bool checkCurrentQuerySession (const std::string &candidate_query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
template<>
void invalidateRunningQuerySession (mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
bool addToQuerySessionList (const std::string &query_session, const std::string &query_str, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
bool removeFromQuerySessionList (const std::string &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
void setQuerySessionAsInterrupted (const std::string &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
bool checkIsQuerySessionInterrupted (const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 

Static Public Member Functions

static std::shared_ptr< Executor > getExecutor (const ExecutorId id, const std::string &debug_dir="", const std::string &debug_file="", const SystemParameters system_parameters=SystemParameters())
 
static void nukeCacheOfExecutors ()
 
static void clearMemory (const Data_Namespace::MemoryLevel memory_level)
 
static size_t getArenaBlockSize ()
 
static std::pair< int64_t, int32_t > reduceResults (const SQLAgg agg, const SQLTypeInfo &ti, const int64_t agg_init_val, const int8_t out_byte_width, const int64_t *out_vec, const size_t out_vec_sz, const bool is_group_by, const bool float_argument_input)
 
static void addCodeToCache (const CodeCacheKey &, std::shared_ptr< CompilationContext >, llvm::Module *, CodeCache &)
 

Static Public Attributes

static const ExecutorId UNITARY_EXECUTOR_ID = 0
 
static const size_t high_scan_limit {32000000}
 
static const int32_t ERR_DIV_BY_ZERO {1}
 
static const int32_t ERR_OUT_OF_GPU_MEM {2}
 
static const int32_t ERR_OUT_OF_SLOTS {3}
 
static const int32_t ERR_UNSUPPORTED_SELF_JOIN {4}
 
static const int32_t ERR_OUT_OF_RENDER_MEM {5}
 
static const int32_t ERR_OUT_OF_CPU_MEM {6}
 
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW {7}
 
static const int32_t ERR_OUT_OF_TIME {9}
 
static const int32_t ERR_INTERRUPTED {10}
 
static const int32_t ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED {11}
 
static const int32_t ERR_TOO_MANY_LITERALS {12}
 
static const int32_t ERR_STRING_CONST_IN_RESULTSET {13}
 
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY {14}
 
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES {15}
 
static const int32_t ERR_GEOS {16}
 
static std::mutex compilation_mutex_
 
static std::mutex kernel_mutex_
 

Private Types

using PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>
 

Private Member Functions

void clearMetaInfoCache ()
 
int deviceCount (const ExecutorDeviceType) const
 
int deviceCountForMemoryLevel (const Data_Namespace::MemoryLevel memory_level) const
 
llvm::Value * codegenWindowFunction (const size_t target_index, const CompilationOptions &co)
 
llvm::Value * codegenWindowFunctionAggregate (const CompilationOptions &co)
 
llvm::BasicBlock * codegenWindowResetStateControlFlow ()
 
void codegenWindowFunctionStateInit (llvm::Value *aggregate_state)
 
llvm::Value * codegenWindowFunctionAggregateCalls (llvm::Value *aggregate_state, const CompilationOptions &co)
 
void codegenWindowAvgEpilogue (llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
 
llvm::Value * codegenAggregateWindowState ()
 
llvm::Value * aggregateWindowStatePtr ()
 
bool isArchPascalOrLater (const ExecutorDeviceType dt) const
 
bool needFetchAllFragments (const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
 
void executeWorkUnitPerFragment (const RelAlgExecutionUnit &ra_exe_unit, const InputTableInfo &table_info, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, PerFragmentCallBack &cb)
 Compiles and dispatches a work unit per fragment, processing the results with the per-fragment callback. Currently used for computing metrics over fragments (metadata). More...
 
ResultSetPtr executeExplain (const QueryCompilationDescriptor &)
 
ResultSetPtr executeTableFunction (const TableFunctionExecutionUnit exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat)
 Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps. More...
 
ExecutorDeviceType getDeviceTypeForTargets (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType requested_device_type)
 
ResultSetPtr collectAllDeviceResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
 
ResultSetPtr collectAllDeviceShardedTopResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
 
std::unordered_map< int, const Analyzer::BinOper * > getInnerTabIdToJoinCond () const
 
std::vector< std::unique_ptr< ExecutionKernel > > createKernels (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, ColumnFetcher &column_fetcher, const std::vector< InputTableInfo > &table_infos, const ExecutionOptions &eo, const bool is_agg, const bool allow_single_frag_table_opt, const size_t context_count, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, RenderInfo *render_info, std::unordered_set< int > &available_gpus, int &available_cpus)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 
std::vector< size_t > getTableFragmentIndices (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type, const size_t table_idx, const size_t outer_frag_idx, std::map< int, const TableFragments *> &selected_tables_fragments, const std::unordered_map< int, const Analyzer::BinOper *> &inner_table_id_to_join_condition)
 
bool skipFragmentPair (const Fragmenter_Namespace::FragmentInfo &outer_fragment_info, const Fragmenter_Namespace::FragmentInfo &inner_fragment_info, const int inner_table_id, const std::unordered_map< int, const Analyzer::BinOper *> &inner_table_id_to_join_condition, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
 
FetchResult fetchChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments *> &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator)
 
FetchResult fetchUnionChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments *> &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator)
 
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags (const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< int, const TableFragments *> &all_tables_fragments)
 
void buildSelectedFragsMapping (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
void buildSelectedFragsMappingForUnion (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< size_t > getFragmentCount (const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
 
int32_t executePlanWithGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t *>> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, RenderInfo *render_info)
 
int32_t executePlanWithoutGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const std::vector< Analyzer::Expr *> &target_exprs, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t *>> &col_buffers, QueryExecutionContext *query_exe_context, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *data_mgr, const int device_id, const uint32_t start_rowid, const uint32_t num_tables, RenderInfo *render_info)
 
ResultSetPtr resultsUnion (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< int64_t > getJoinHashTablePtrs (const ExecutorDeviceType device_type, const int device_id)
 
ResultSetPtr reduceMultiDeviceResults (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceMultiDeviceResultSets (std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceSpeculativeTopN (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr executeWorkUnitImpl (size_t &max_groups_buffer_entry_guess, const bool is_agg, const bool allow_single_frag_table_opt, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, std::shared_ptr< RowSetMemoryOwner >, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
std::vector< llvm::Value * > inlineHoistedLiterals ()
 
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit (const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
 
llvm::BasicBlock * codegenSkipDeletedOuterTableRow (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::vector< JoinLoop > buildJoinLoops (RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
 
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
 
std::shared_ptr< JoinHashTableInterface > buildCurrentLevelHashTable (const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
 
void redeclareFilterFunction ()
 
llvm::Value * addJoinLoopIterator (const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
 
void codegenJoinLoops (const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
 
bool compileBody (const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
 
void createErrorCheckControlFlow (llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type)
 
void preloadFragOffsets (const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
 
JoinHashTableOrError buildHashTableForQualifier (const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinHashTableInterface::HashType preferred_hash_type, ColumnCacheMap &column_cache)
 
void nukeOldState (const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU (llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function *> &, const CompilationOptions &)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU (llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function *> &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
 
std::string generatePTX (const std::string &) const
 
void initializeNVPTXBackend () const
 
int64_t deviceCycles (int milliseconds) const
 
GroupColLLVMValue groupByColumnCodegen (Analyzer::Expr *group_by_col, const size_t col_width, const CompilationOptions &, const bool translate_null_val, const int64_t translated_null_val, GroupByAndAggregate::DiamondCodegen &, std::stack< llvm::BasicBlock *> &, const bool thread_mem_shared)
 
llvm::Value * castToFP (llvm::Value *val)
 
llvm::Value * castToIntPtrTyIn (llvm::Value *val, const size_t bit_width)
 
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::pair< bool, int64_t > skipFragment (const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
std::pair< bool, int64_t > skipFragmentInnerJoins (const InputDescriptor &table_desc, const RelAlgExecutionUnit &ra_exe_unit, const Fragmenter_Namespace::FragmentInfo &fragment, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
AggregatedColRange computeColRangesCache (const std::unordered_set< PhysicalInput > &phys_inputs)
 
StringDictionaryGenerations computeStringDictionaryGenerations (const std::unordered_set< PhysicalInput > &phys_inputs)
 
TableGenerations computeTableGenerations (std::unordered_set< int > phys_table_ids)
 
std::shared_ptr< CompilationContext > getCodeFromCache (const CodeCacheKey &, const CodeCache &)
 
std::vector< int8_t > serializeLiterals (const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
 
llvm::Value * spillDoubleElement (llvm::Value *elem_val, llvm::Type *elem_ty)
 

Static Private Member Functions

static size_t align (const size_t off_in, const size_t alignment)
 

Private Attributes

std::unique_ptr< CgenState > cgen_state_
 
std::unique_ptr< PlanState > plan_state_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::mutex gpu_exec_mutex_ [max_gpu_count]
 
std::shared_ptr< StringDictionaryProxy > lit_str_dict_proxy_
 
std::mutex str_dict_mutex_
 
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
 
CodeCache cpu_code_cache_
 
CodeCache gpu_code_cache_
 
const unsigned block_size_x_
 
const unsigned grid_size_x_
 
const size_t max_gpu_slab_size_
 
const std::string debug_dir_
 
const std::string debug_file_
 
const ExecutorId executor_id_
 
const Catalog_Namespace::Catalog * catalog_
 
const TemporaryTables * temporary_tables_
 
int64_t kernel_queue_time_ms_ = 0
 
int64_t compilation_queue_time_ms_ = 0
 
std::unique_ptr< WindowProjectNodeContext > window_project_node_context_owned_
 
WindowFunctionContext * active_window_function_ {nullptr}
 
InputTableInfoCache input_table_info_cache_
 
AggregatedColRange agg_col_range_cache_
 
StringDictionaryGenerations string_dictionary_generations_
 
TableGenerations table_generations_
 

Static Private Attributes

static const int max_gpu_count {16}
 
static std::mutex gpu_active_modules_mutex_
 
static uint32_t gpu_active_modules_device_mask_ {0x0}
 
static void * gpu_active_modules_ [max_gpu_count]
 
static std::atomic< bool > interrupted_ {false}
 
static const size_t baseline_threshold
 
static const size_t code_cache_size {1000}
 
static mapd_shared_mutex executor_session_mutex_
 
static std::string current_query_session_ {""}
 
static InterruptFlagMap queries_interrupt_flag_
 
static QuerySessionMap queries_session_map_
 
static std::map< int, std::shared_ptr< Executor > > executors_
 
static std::atomic_flag execute_spin_lock_ = ATOMIC_FLAG_INIT
 
static mapd_shared_mutex execute_mutex_
 
static mapd_shared_mutex executors_cache_mutex_
 
static mapd_shared_mutex recycler_mutex_
 
static std::unordered_map< std::string, size_t > cardinality_cache_
 

Friends

class BaselineJoinHashTable
 
class CodeGenerator
 
class ColumnFetcher
 
class ExecutionKernel
 
class OverlapsJoinHashTable
 
class GroupByAndAggregate
 
class QueryCompilationDescriptor
 
class QueryMemoryDescriptor
 
class QueryMemoryInitializer
 
class QueryFragmentDescriptor
 
class QueryExecutionContext
 
class ResultSet
 
class InValuesBitmap
 
class JoinHashTable
 
class LeafAggregator
 
class QueryRewriter
 
class PendingExecutionClosure
 
class RelAlgExecutor
 
class TableOptimizer
 
class TableFunctionCompilationContext
 
class TableFunctionExecutionContext
 
struct TargetExprCodegenBuilder
 
struct TargetExprCodegen
 
class WindowProjectNodeContext
 

Detailed Description

Definition at line 329 of file Execute.h.

Member Typedef Documentation

◆ CachedCardinality

using Executor::CachedCardinality = std::pair<bool, size_t>

Definition at line 879 of file Execute.h.

◆ ExecutorId

using Executor::ExecutorId = size_t

Definition at line 336 of file Execute.h.

◆ PerFragmentCallBack

Definition at line 487 of file Execute.h.
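
PerFragmentCallBack is the callback type consumed by executeWorkUnitPerFragment(): the executor invokes it once per fragment of the given table, passing that fragment's ResultSet together with its FragmentInfo metadata. A minimal sketch of a conforming callback, written as it might appear inside a friend class such as TableOptimizer, and assuming the usual ResultSet::rowCount() and FragmentInfo::fragmentId accessors:

// Matches std::function<void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo&)>.
Executor::PerFragmentCallBack cb = [](ResultSetPtr fragment_results,
                                      const Fragmenter_Namespace::FragmentInfo& frag_info) {
  if (fragment_results) {
    VLOG(1) << "fragment " << frag_info.fragmentId << " produced "
            << fragment_results->rowCount() << " rows";
  }
};
// 'cb' is then passed as the last argument of executeWorkUnitPerFragment().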

Constructor & Destructor Documentation

◆ Executor()

Executor::Executor ( const ExecutorId  id,
const size_t  block_size_x,
const size_t  grid_size_x,
const size_t  max_gpu_slab_size,
const std::string &  debug_dir,
const std::string &  debug_file 
)

Definition at line 131 of file Execute.cpp.

References block_size_x_, catalog_, code_cache_size, cpu_code_cache_, debug_dir_, debug_file_, executor_id_, gpu_code_cache_, grid_size_x_, input_table_info_cache_, max_gpu_slab_size_, and temporary_tables_.

Referenced by WatchdogException::WatchdogException(), and window_function_is_aggregate().

137  : cgen_state_(new CgenState({}, false))
140  , block_size_x_(block_size_x)
141  , grid_size_x_(grid_size_x)
142  , max_gpu_slab_size_(max_gpu_slab_size)
143  , debug_dir_(debug_dir)
144  , debug_file_(debug_file)
145  , executor_id_(executor_id)
146  , catalog_(nullptr)
147  , temporary_tables_(nullptr)
148  , input_table_info_cache_(this) {}
const std::string debug_dir_
Definition: Execute.h:940
static const size_t code_cache_size
Definition: Execute.h:935
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
CodeCache gpu_code_cache_
Definition: Execute.h:931
const ExecutorId executor_id_
Definition: Execute.h:943
const size_t max_gpu_slab_size_
Definition: Execute.h:939
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
const std::string debug_file_
Definition: Execute.h:941
const unsigned block_size_x_
Definition: Execute.h:937
const unsigned grid_size_x_
Definition: Execute.h:938
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:956
CodeCache cpu_code_cache_
Definition: Execute.h:930
const TemporaryTables * temporary_tables_
Definition: Execute.h:945
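
In practice an Executor is rarely constructed directly; callers go through the static getExecutor() factory, which caches instances keyed by ExecutorId (see executors_ below). A minimal usage sketch, assuming a Catalog_Namespace::Catalog instance 'cat' is available in the calling context:

// debug_dir, debug_file and system_parameters fall back to their defaults.
auto executor = Executor::getExecutor(Executor::UNITARY_EXECUTOR_ID);
CHECK(executor);
executor->setCatalog(&cat);  // 'cat' is assumed to exist in the caller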

Member Function Documentation

◆ addCodeToCache()

void Executor::addCodeToCache ( const CodeCacheKey &  key,
std::shared_ptr< CompilationContext >  compilation_context,
llvm::Module *  module,
CodeCache &  cache 
)
static

Definition at line 224 of file NativeCodegen.cpp.

References LruCache< key_t, value_t, hash_t >::put().

Referenced by ResultSetReductionJIT::finalizeReductionCode(), and StubGenerator::generateStub().

227  {
228  cache.put(key,
229  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
230  std::move(compilation_context), std::move(module)));
231 }
void put(const key_t &key, value_t &&value)
Definition: LruCache.hpp:27
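
getCodeFromCache() is the read-side counterpart of this function; together they implement a compile-or-reuse pattern over the CPU and GPU code caches (cpu_code_cache_, gpu_code_cache_). An illustrative sketch only; names such as 'key' and 'module' are placeholders, not the actual call sites:

std::shared_ptr<CompilationContext> ctx = getCodeFromCache(key, cpu_code_cache_);
if (!ctx) {
  // ... run codegen, e.g. optimizeAndCodegenCPU(...), producing 'ctx' and the owning llvm::Module* 'module' ...
  addCodeToCache(key, ctx, module, cpu_code_cache_);
}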

◆ addDeletedColumn()

std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > Executor::addDeletedColumn ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 3195 of file Execute.cpp.

References catalog_, CHECK, CHECK_EQ, CompilationOptions::filter_on_deleted_column, Catalog_Namespace::Catalog::getDeletedColumnIfRowsDeleted(), Catalog_Namespace::Catalog::getMetadataForTable(), and TABLE.

Referenced by executeWorkUnitImpl(), and executeWorkUnitPerFragment().

3197  {
3198  if (!co.filter_on_deleted_column) {
3199  return std::make_tuple(ra_exe_unit, PlanState::DeletedColumnsMap{});
3200  }
3201  auto ra_exe_unit_with_deleted = ra_exe_unit;
3202  PlanState::DeletedColumnsMap deleted_cols_map;
3203  for (const auto& input_table : ra_exe_unit_with_deleted.input_descs) {
3204  if (input_table.getSourceType() != InputSourceType::TABLE) {
3205  continue;
3206  }
3207  const auto td = catalog_->getMetadataForTable(input_table.getTableId());
3208  CHECK(td);
3209  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
3210  if (!deleted_cd) {
3211  continue;
3212  }
3213  CHECK(deleted_cd->columnType.is_boolean());
3214  // check deleted column is not already present
3215  bool found = false;
3216  for (const auto& input_col : ra_exe_unit_with_deleted.input_col_descs) {
3217  if (input_col.get()->getColId() == deleted_cd->columnId &&
3218  input_col.get()->getScanDesc().getTableId() == deleted_cd->tableId &&
3219  input_col.get()->getScanDesc().getNestLevel() == input_table.getNestLevel()) {
3220  found = true;
3221  }
3222  }
3223  if (!found) {
3224  // add deleted column
3225  ra_exe_unit_with_deleted.input_col_descs.emplace_back(new InputColDescriptor(
3226  deleted_cd->columnId, deleted_cd->tableId, input_table.getNestLevel()));
3227  auto deleted_cols_it = deleted_cols_map.find(deleted_cd->tableId);
3228  if (deleted_cols_it == deleted_cols_map.end()) {
3229  CHECK(deleted_cols_map.insert(std::make_pair(deleted_cd->tableId, deleted_cd))
3230  .second);
3231  } else {
3232  CHECK_EQ(deleted_cd, deleted_cols_it->second);
3233  }
3234  }
3235  }
3236  return std::make_tuple(ra_exe_unit_with_deleted, deleted_cols_map);
3237 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
const ColumnDescriptor * getDeletedColumnIfRowsDeleted(const TableDescriptor *td) const
Definition: Catalog.cpp:2856
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
std::unordered_map< TableId, const ColumnDescriptor * > DeletedColumnsMap
Definition: PlanState.h:44
#define CHECK(condition)
Definition: Logger.h:197

◆ addJoinLoopIterator()

llvm::Value * Executor::addJoinLoopIterator ( const std::vector< llvm::Value *> &  prev_iters,
const size_t  level_idx 
)
private

Definition at line 635 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, and CgenState::scan_idx_to_hash_pos_.

636  {
638  // Iterators are added for loop-outer joins when the head of the loop is generated,
 639  // then once again when the body is generated. Allow this instead of special handling
640  // of call sites.
641  const auto it = cgen_state_->scan_idx_to_hash_pos_.find(level_idx);
642  if (it != cgen_state_->scan_idx_to_hash_pos_.end()) {
643  return it->second;
644  }
645  CHECK(!prev_iters.empty());
646  llvm::Value* matching_row_index = prev_iters.back();
647  const auto it_ok =
648  cgen_state_->scan_idx_to_hash_pos_.emplace(level_idx, matching_row_index);
649  CHECK(it_ok.second);
650  return matching_row_index;
651 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:197

◆ addToCardinalityCache()

void Executor::addToCardinalityCache ( const std::string &  cache_key,
const size_t  cache_value 
)

Definition at line 3617 of file Execute.cpp.

References cardinality_cache_, g_use_estimator_result_cache, recycler_mutex_, and VLOG.

3618  {
3620  mapd_unique_lock<mapd_shared_mutex> lock(recycler_mutex_);
3621  cardinality_cache_[cache_key] = cache_value;
3622  VLOG(1) << "Put estimated cardinality to the cache";
3623  }
3624 }
static std::unordered_map< std::string, size_t > cardinality_cache_
Definition: Execute.h:978
bool g_use_estimator_result_cache
Definition: Execute.cpp:109
static mapd_shared_mutex recycler_mutex_
Definition: Execute.h:977
#define VLOG(n)
Definition: Logger.h:291
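
Paired with getCachedCardinality(), this provides a simple string-keyed cache for estimator results (only effective while g_use_estimator_result_cache is enabled, per the listing above); the returned CachedCardinality is the std::pair<bool, size_t> declared earlier, where the bool presumably indicates whether a value was found for the key. A minimal sketch, assuming the caller has already serialized the estimator work unit into 'cache_key':

executor->addToCardinalityCache(cache_key, 12345);      // store an estimate

const auto cached = executor->getCachedCardinality(cache_key);
if (cached.first) {
  const size_t estimated_cardinality = cached.second;   // reuse instead of re-running the estimator
}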

◆ addToQuerySessionList() [1/2]

template<typename SESSION_MAP_LOCK >
bool Executor::addToQuerySessionList ( const std::string &  query_session,
const std::string &  query_str,
SESSION_MAP_LOCK &  write_lock 
)

◆ addToQuerySessionList() [2/2]

template<>
bool Executor::addToQuerySessionList ( const std::string &  query_session,
const std::string &  query_str,
mapd_unique_lock< mapd_shared_mutex > &  write_lock 
)

Definition at line 3574 of file Execute.cpp.

References queries_interrupt_flag_, and queries_session_map_.

3576  {
3577  queries_session_map_.emplace(
3578  query_session,
3579  QuerySessionStatus(query_session, query_str, std::chrono::system_clock::now()));
3580  return queries_interrupt_flag_.emplace(query_session, false).second;
3581 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:965
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:963
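
The session-tracking members above are templated on the lock type and are intended to be called while holding the mutex returned by getSessionLock(); the specializations shown take a mapd_unique_lock for writes and a mapd_shared_lock for reads. A minimal lifecycle sketch, assuming 'session_id' and 'query_str' are owned by the caller:

{
  mapd_unique_lock<mapd_shared_mutex> write_lock(executor->getSessionLock());
  executor->addToQuerySessionList(session_id, query_str, write_lock);
  executor->setCurrentQuerySession(session_id, write_lock);
}
// ... execute the query ...
{
  mapd_unique_lock<mapd_shared_mutex> write_lock(executor->getSessionLock());
  executor->removeFromQuerySessionList(session_id, write_lock);
  executor->invalidateRunningQuerySession(write_lock);
}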

◆ aggregateWindowStatePtr()

llvm::Value * Executor::aggregateWindowStatePtr ( )
private

Definition at line 124 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), and kFLOAT.

124  {
126  const auto window_func_context =
128  const auto window_func = window_func_context->getWindowFunction();
129  const auto arg_ti = get_adjusted_window_type_info(window_func);
130  llvm::Type* aggregate_state_type =
131  arg_ti.get_type() == kFLOAT
132  ? llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0)
133  : llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
134  const auto aggregate_state_i64 = cgen_state_->llInt(
135  reinterpret_cast<const int64_t>(window_func_context->aggregateState()));
136  return cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_i64,
137  aggregate_state_type);
138 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

◆ align()

static size_t Executor::align ( const size_t  off_in,
const size_t  alignment 
)
inlinestaticprivate

Definition at line 891 of file Execute.h.

Referenced by serializeLiterals().

891  {
892  size_t off = off_in;
893  if (off % alignment != 0) {
894  off += (alignment - off % alignment);
895  }
896  return off;
897  }

◆ blockSize()

unsigned Executor::blockSize ( ) const

Definition at line 3123 of file Execute.cpp.

References block_size_x_, catalog_, CHECK, Data_Namespace::DataMgr::getCudaMgr(), and Catalog_Namespace::Catalog::getDataMgr().

Referenced by executePlanWithGroupBy(), and executePlanWithoutGroupBy().

3123  {
3124  CHECK(catalog_);
3125  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
3126  CHECK(cuda_mgr);
3127  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3128  return block_size_x_ ? block_size_x_ : dev_props.front().maxThreadsPerBlock;
3129 }
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:209
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
const unsigned block_size_x_
Definition: Execute.h:937
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:208
#define CHECK(condition)
Definition: Logger.h:197

◆ buildCurrentLevelHashTable()

std::shared_ptr< JoinHashTableInterface > Executor::buildCurrentLevelHashTable ( const JoinCondition &  current_level_join_conditions,
RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache,
std::vector< std::string > &  fail_reasons 
)
private

Definition at line 490 of file IRCodegen.cpp.

References anonymous_namespace{IRCodegen.cpp}::add_qualifier_to_execution_unit(), AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, Data_Namespace::CPU_LEVEL, CompilationOptions::device_type, Executor::JoinHashTableOrError::fail_reason, GPU, Data_Namespace::GPU_LEVEL, Executor::JoinHashTableOrError::hash_table, INNER, IS_EQUIVALENCE, PlanState::join_info_, JoinHashTableInterface::OneToOne, CodeGenerator::plan_state_, JoinCondition::quals, and JoinCondition::type.

496  {
498  if (current_level_join_conditions.type != JoinType::INNER &&
499  current_level_join_conditions.quals.size() > 1) {
500  fail_reasons.emplace_back("No equijoin expression found for outer join");
501  return nullptr;
502  }
503  std::shared_ptr<JoinHashTableInterface> current_level_hash_table;
504  for (const auto& join_qual : current_level_join_conditions.quals) {
505  auto qual_bin_oper = std::dynamic_pointer_cast<Analyzer::BinOper>(join_qual);
506  if (!qual_bin_oper || !IS_EQUIVALENCE(qual_bin_oper->get_optype())) {
507  fail_reasons.emplace_back("No equijoin expression found");
508  if (current_level_join_conditions.type == JoinType::INNER) {
509  add_qualifier_to_execution_unit(ra_exe_unit, join_qual);
510  }
511  continue;
512  }
513  JoinHashTableOrError hash_table_or_error;
514  if (!current_level_hash_table) {
515  hash_table_or_error = buildHashTableForQualifier(
516  qual_bin_oper,
517  query_infos,
521  column_cache);
522  current_level_hash_table = hash_table_or_error.hash_table;
523  }
524  if (hash_table_or_error.hash_table) {
525  plan_state_->join_info_.join_hash_tables_.push_back(hash_table_or_error.hash_table);
526  plan_state_->join_info_.equi_join_tautologies_.push_back(qual_bin_oper);
527  } else {
528  fail_reasons.push_back(hash_table_or_error.fail_reason);
529  if (current_level_join_conditions.type == JoinType::INNER) {
530  add_qualifier_to_execution_unit(ra_exe_unit, qual_bin_oper);
531  }
532  }
533  }
534  return current_level_hash_table;
535 }
#define IS_EQUIVALENCE(X)
Definition: sqldefs.h:67
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
void add_qualifier_to_execution_unit(RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< Analyzer::Expr > &qual)
Definition: IRCodegen.cpp:228
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
std::list< std::shared_ptr< Analyzer::Expr > > quals
JoinHashTableOrError buildHashTableForQualifier(const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinHashTableInterface::HashType preferred_hash_type, ColumnCacheMap &column_cache)
Definition: Execute.cpp:3069

◆ buildHashTableForQualifier()

Executor::JoinHashTableOrError Executor::buildHashTableForQualifier ( const std::shared_ptr< Analyzer::BinOper > &  qual_bin_oper,
const std::vector< InputTableInfo > &  query_infos,
const MemoryLevel  memory_level,
const JoinHashTableInterface::HashType  preferred_hash_type,
ColumnCacheMap &  column_cache 
)
private

Definition at line 3069 of file Execute.cpp.

References deviceCountForMemoryLevel(), ERR_INTERRUPTED, g_enable_dynamic_watchdog, g_enable_overlaps_hashjoin, g_enable_runtime_query_interrupt, JoinHashTableInterface::getInstance(), interrupted_, and resetInterrupt().

3074  {
3075  if (!g_enable_overlaps_hashjoin && qual_bin_oper->is_overlaps_oper()) {
3076  return {nullptr, "Overlaps hash join disabled, attempting to fall back to loop join"};
3077  }
3078  // check whether the interrupt flag turns on (non kernel-time query interrupt)
3080  interrupted_.load()) {
3081  resetInterrupt();
3083  }
3084  try {
3085  auto tbl =
3087  query_infos,
3088  memory_level,
3089  preferred_hash_type,
3090  deviceCountForMemoryLevel(memory_level),
3091  column_cache,
3092  this);
3093  return {tbl, ""};
3094  } catch (const HashJoinFail& e) {
3095  return {nullptr, e.what()};
3096  }
3097 }
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:989
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:75
bool g_enable_overlaps_hashjoin
Definition: Execute.cpp:92
static std::shared_ptr< JoinHashTableInterface > getInstance(const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor)
Make hash table from an in-flight SQL query&#39;s parse tree etc.
int deviceCountForMemoryLevel(const Data_Namespace::MemoryLevel memory_level) const
Definition: Execute.cpp:626
void resetInterrupt()
static std::atomic< bool > interrupted_
Definition: Execute.h:923
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:108

◆ buildIsDeletedCb()

std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> Executor::buildIsDeletedCb ( const RelAlgExecutionUnit &  ra_exe_unit,
const size_t  level_idx,
const CompilationOptions &  co 
)
private

Definition at line 431 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::filter_on_deleted_column, PlanState::getDeletedColForTable(), RelAlgExecutionUnit::input_descs, CgenState::ir_builder_, CgenState::llBool(), CgenState::llInt(), CodeGenerator::plan_state_, TABLE, and CodeGenerator::toBool().

433  {
435  if (!co.filter_on_deleted_column) {
436  return nullptr;
437  }
438  CHECK_LT(level_idx + 1, ra_exe_unit.input_descs.size());
439  const auto input_desc = ra_exe_unit.input_descs[level_idx + 1];
440  if (input_desc.getSourceType() != InputSourceType::TABLE) {
441  return nullptr;
442  }
443 
444  const auto deleted_cd = plan_state_->getDeletedColForTable(input_desc.getTableId());
445  if (!deleted_cd) {
446  return nullptr;
447  }
448  CHECK(deleted_cd->columnType.is_boolean());
449  const auto deleted_expr = makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
450  input_desc.getTableId(),
451  deleted_cd->columnId,
452  input_desc.getNestLevel());
453  return [this, deleted_expr, level_idx, &co](const std::vector<llvm::Value*>& prev_iters,
454  llvm::Value* have_more_inner_rows) {
455  const auto matching_row_index = addJoinLoopIterator(prev_iters, level_idx + 1);
456  // Avoid fetching the deleted column from a position which is not valid.
457  // An invalid position can be returned by a one to one hash lookup (negative)
458  // or at the end of iteration over a set of matching values.
459  llvm::Value* is_valid_it{nullptr};
460  if (have_more_inner_rows) {
461  is_valid_it = have_more_inner_rows;
462  } else {
463  is_valid_it = cgen_state_->ir_builder_.CreateICmp(
464  llvm::ICmpInst::ICMP_SGE, matching_row_index, cgen_state_->llInt<int64_t>(0));
465  }
466  const auto it_valid_bb = llvm::BasicBlock::Create(
467  cgen_state_->context_, "it_valid", cgen_state_->current_func_);
468  const auto it_not_valid_bb = llvm::BasicBlock::Create(
469  cgen_state_->context_, "it_not_valid", cgen_state_->current_func_);
470  cgen_state_->ir_builder_.CreateCondBr(is_valid_it, it_valid_bb, it_not_valid_bb);
471  const auto row_is_deleted_bb = llvm::BasicBlock::Create(
472  cgen_state_->context_, "row_is_deleted", cgen_state_->current_func_);
473  cgen_state_->ir_builder_.SetInsertPoint(it_valid_bb);
474  CodeGenerator code_generator(this);
475  const auto row_is_deleted = code_generator.toBool(
476  code_generator.codegen(deleted_expr.get(), true, co).front());
477  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
478  cgen_state_->ir_builder_.SetInsertPoint(it_not_valid_bb);
479  const auto row_is_deleted_default = cgen_state_->llBool(false);
480  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
481  cgen_state_->ir_builder_.SetInsertPoint(row_is_deleted_bb);
482  auto row_is_deleted_or_default =
483  cgen_state_->ir_builder_.CreatePHI(row_is_deleted->getType(), 2);
484  row_is_deleted_or_default->addIncoming(row_is_deleted, it_valid_bb);
485  row_is_deleted_or_default->addIncoming(row_is_deleted_default, it_not_valid_bb);
486  return row_is_deleted_or_default;
487  };
488 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635

◆ buildJoinLoops()

std::vector< JoinLoop > Executor::buildJoinLoops ( RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache 
)
private

Definition at line 260 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, anonymous_namespace{IRCodegen.cpp}::check_if_loop_join_is_allowed(), CHECK_LT, CodeGenerator::codegen(), get_arg_by_name(), INJECT_TIMER, CgenState::ir_builder_, join(), RelAlgExecutionUnit::join_quals, LEFT, CgenState::llBool(), CgenState::llInt(), JoinHashTableInterface::OneToOne, CgenState::outer_join_match_found_per_level_, CgenState::row_func_, Set, Singleton, CodeGenerator::toBool(), JoinCondition::type, UpperBound, and VLOG.

265  {
268  std::vector<JoinLoop> join_loops;
269  for (size_t level_idx = 0, current_hash_table_idx = 0;
270  level_idx < ra_exe_unit.join_quals.size();
271  ++level_idx) {
272  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
273  std::vector<std::string> fail_reasons;
274  const auto build_cur_level_hash_table = [&]() {
275  if (current_level_join_conditions.quals.size() > 1) {
276  const auto first_qual = *current_level_join_conditions.quals.begin();
277  auto qual_bin_oper =
278  std::dynamic_pointer_cast<const Analyzer::BinOper>(first_qual);
279  if (qual_bin_oper && qual_bin_oper->is_overlaps_oper() &&
280  current_level_join_conditions.type == JoinType::LEFT) {
281  JoinCondition join_condition{{first_qual}, current_level_join_conditions.type};
282 
284  join_condition, ra_exe_unit, co, query_infos, column_cache, fail_reasons);
285  }
286  }
287  return buildCurrentLevelHashTable(current_level_join_conditions,
288  ra_exe_unit,
289  co,
290  query_infos,
291  column_cache,
292  fail_reasons);
293  };
294  const auto current_level_hash_table = build_cur_level_hash_table();
295  const auto found_outer_join_matches_cb =
296  [this, level_idx](llvm::Value* found_outer_join_matches) {
297  CHECK_LT(level_idx, cgen_state_->outer_join_match_found_per_level_.size());
298  CHECK(!cgen_state_->outer_join_match_found_per_level_[level_idx]);
299  cgen_state_->outer_join_match_found_per_level_[level_idx] =
300  found_outer_join_matches;
301  };
302  const auto is_deleted_cb = buildIsDeletedCb(ra_exe_unit, level_idx, co);
303  const auto outer_join_condition_multi_quals_cb =
304  [this, level_idx, &co, &current_level_join_conditions](
305  const std::vector<llvm::Value*>& prev_iters) {
306  // The values generated for the match path don't dominate all uses
307  // since on the non-match path nulls are generated. Reset the cache
308  // once the condition is generated to avoid incorrect reuse.
309  FetchCacheAnchor anchor(cgen_state_.get());
310  addJoinLoopIterator(prev_iters, level_idx + 1);
311  llvm::Value* left_join_cond = cgen_state_->llBool(true);
312  CodeGenerator code_generator(this);
313  // Do not want to look at all quals! only 1..N quals (ignore first qual)
314  // Note(jclay): this may need to support cases larger than 2
315  // are there any?
316  if (current_level_join_conditions.quals.size() >= 2) {
317  auto qual_it = std::next(current_level_join_conditions.quals.begin(), 1);
318  for (; qual_it != current_level_join_conditions.quals.end();
319  std::advance(qual_it, 1)) {
320  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
321  left_join_cond,
322  code_generator.toBool(
323  code_generator.codegen(qual_it->get(), true, co).front()));
324  }
325  }
326  return left_join_cond;
327  };
328  if (current_level_hash_table) {
329  if (current_level_hash_table->getHashType() == JoinHashTable::HashType::OneToOne) {
330  join_loops.emplace_back(
331  /*kind=*/JoinLoopKind::Singleton,
332  /*type=*/current_level_join_conditions.type,
333  /*iteration_domain_codegen=*/
334  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
335  const std::vector<llvm::Value*>& prev_iters) {
336  addJoinLoopIterator(prev_iters, level_idx);
337  JoinLoopDomain domain{{0}};
338  domain.slot_lookup_result =
339  current_level_hash_table->codegenSlot(co, current_hash_table_idx);
340  return domain;
341  },
342  /*outer_condition_match=*/nullptr,
343  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
344  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
345  : nullptr,
346  /*is_deleted=*/is_deleted_cb);
347  } else {
348  join_loops.emplace_back(
349  /*kind=*/JoinLoopKind::Set,
350  /*type=*/current_level_join_conditions.type,
351  /*iteration_domain_codegen=*/
352  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
353  const std::vector<llvm::Value*>& prev_iters) {
354  addJoinLoopIterator(prev_iters, level_idx);
355  JoinLoopDomain domain{{0}};
356  const auto matching_set = current_level_hash_table->codegenMatchingSet(
357  co, current_hash_table_idx);
358  domain.values_buffer = matching_set.elements;
359  domain.element_count = matching_set.count;
360  return domain;
361  },
362  /*outer_condition_match=*/
363  current_level_join_conditions.type == JoinType::LEFT
364  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
365  outer_join_condition_multi_quals_cb)
366  : nullptr,
367  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
368  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
369  : nullptr,
370  /*is_deleted=*/is_deleted_cb);
371  }
372  ++current_hash_table_idx;
373  } else {
374  const auto fail_reasons_str = current_level_join_conditions.quals.empty()
375  ? "No equijoin expression found"
376  : boost::algorithm::join(fail_reasons, " | ");
378  ra_exe_unit, eo, query_infos, level_idx, fail_reasons_str);
379  // Callback provided to the `JoinLoop` framework to evaluate the (outer) join
380  // condition.
381  VLOG(1) << "Unable to build hash table, falling back to loop join: "
382  << fail_reasons_str;
383  const auto outer_join_condition_cb =
384  [this, level_idx, &co, &current_level_join_conditions](
385  const std::vector<llvm::Value*>& prev_iters) {
386  // The values generated for the match path don't dominate all uses
387  // since on the non-match path nulls are generated. Reset the cache
388  // once the condition is generated to avoid incorrect reuse.
389  FetchCacheAnchor anchor(cgen_state_.get());
390  addJoinLoopIterator(prev_iters, level_idx + 1);
391  llvm::Value* left_join_cond = cgen_state_->llBool(true);
392  CodeGenerator code_generator(this);
393  for (auto expr : current_level_join_conditions.quals) {
394  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
395  left_join_cond,
396  code_generator.toBool(
397  code_generator.codegen(expr.get(), true, co).front()));
398  }
399  return left_join_cond;
400  };
401  join_loops.emplace_back(
402  /*kind=*/JoinLoopKind::UpperBound,
403  /*type=*/current_level_join_conditions.type,
404  /*iteration_domain_codegen=*/
405  [this, level_idx](const std::vector<llvm::Value*>& prev_iters) {
406  addJoinLoopIterator(prev_iters, level_idx);
407  JoinLoopDomain domain{{0}};
408  const auto rows_per_scan_ptr = cgen_state_->ir_builder_.CreateGEP(
409  get_arg_by_name(cgen_state_->row_func_, "num_rows_per_scan"),
410  cgen_state_->llInt(int32_t(level_idx + 1)));
411  domain.upper_bound = cgen_state_->ir_builder_.CreateLoad(rows_per_scan_ptr,
412  "num_rows_per_scan");
413  return domain;
414  },
415  /*outer_condition_match=*/
416  current_level_join_conditions.type == JoinType::LEFT
417  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
418  outer_join_condition_cb)
419  : nullptr,
420  /*found_outer_matches=*/
421  current_level_join_conditions.type == JoinType::LEFT
422  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
423  : nullptr,
424  /*is_deleted=*/is_deleted_cb);
425  }
426  }
427  return join_loops;
428 }
std::string join(T const &container, std::string const &delim)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:129
#define INJECT_TIMER(DESC)
Definition: measure.h:93
const JoinQualsPerNestingLevel join_quals
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
void check_if_loop_join_is_allowed(RelAlgExecutionUnit &ra_exe_unit, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, const size_t level_idx, const std::string &fail_reason)
Definition: IRCodegen.cpp:238
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:260
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
Definition: IRCodegen.cpp:431
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635
#define VLOG(n)
Definition: Logger.h:291
std::shared_ptr< JoinHashTableInterface > buildCurrentLevelHashTable(const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
Definition: IRCodegen.cpp:490

◆ buildSelectedFragsMapping()

void Executor::buildSelectedFragsMapping ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList &  selected_fragments,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 2564 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LT, getFragmentCount(), RelAlgExecutionUnit::input_descs, and plan_state_.

Referenced by fetchChunks().

2569  {
2570  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2571  size_t frag_pos{0};
2572  const auto& input_descs = ra_exe_unit.input_descs;
2573  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2574  const int table_id = input_descs[scan_idx].getTableId();
2575  CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2576  selected_fragments_crossjoin.push_back(
2577  getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2578  for (const auto& col_id : col_global_ids) {
2579  CHECK(col_id);
2580  const auto& input_desc = col_id->getScanDesc();
2581  if (input_desc.getTableId() != table_id ||
2582  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2583  continue;
2584  }
2585  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2586  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2587  CHECK_LT(static_cast<size_t>(it->second),
2588  plan_state_->global_to_local_col_ids_.size());
2589  local_col_to_frag_pos[it->second] = frag_pos;
2590  }
2591  ++frag_pos;
2592  }
2593 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define CHECK_LT(x, y)
Definition: Logger.h:207
std::vector< size_t > getFragmentCount(const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:2550
#define CHECK(condition)
Definition: Logger.h:197

◆ buildSelectedFragsMappingForUnion()

void Executor::buildSelectedFragsMappingForUnion ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList &  selected_fragments,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 2595 of file Execute.cpp.

References CHECK, CHECK_LT, RelAlgExecutionUnit::input_descs, and plan_state_.

Referenced by fetchUnionChunks().

2600  {
2601  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2602  size_t frag_pos{0};
2603  const auto& input_descs = ra_exe_unit.input_descs;
2604  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2605  const int table_id = input_descs[scan_idx].getTableId();
2606  // selected_fragments here is from assignFragsToKernelDispatch
2607  // execution_kernel.fragments
2608  if (selected_fragments[0].table_id != table_id) { // TODO 0
2609  continue;
2610  }
2611  // CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2612  selected_fragments_crossjoin.push_back(
2613  // getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2614  {size_t(1)}); // TODO
2615  for (const auto& col_id : col_global_ids) {
2616  CHECK(col_id);
2617  const auto& input_desc = col_id->getScanDesc();
2618  if (input_desc.getTableId() != table_id ||
2619  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2620  continue;
2621  }
2622  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2623  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2624  CHECK_LT(static_cast<size_t>(it->second),
2625  plan_state_->global_to_local_col_ids_.size());
2626  local_col_to_frag_pos[it->second] = frag_pos;
2627  }
2628  ++frag_pos;
2629  }
2630 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the caller graph for this function:

◆ castToFP()

llvm::Value * Executor::castToFP ( llvm::Value *  val)
private

Definition at line 3143 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, cgen_state_, logger::FATAL, LOG, and to_string().

3143  {
3145  if (!val->getType()->isIntegerTy()) {
3146  return val;
3147  }
3148 
3149  auto val_width = static_cast<llvm::IntegerType*>(val->getType())->getBitWidth();
3150  llvm::Type* dest_ty{nullptr};
3151  switch (val_width) {
3152  case 32:
3153  dest_ty = llvm::Type::getFloatTy(cgen_state_->context_);
3154  break;
3155  case 64:
3156  dest_ty = llvm::Type::getDoubleTy(cgen_state_->context_);
3157  break;
3158  default:
3159  LOG(FATAL) << "Unsupported FP width: " << std::to_string(val_width);
3160  }
3161  return cgen_state_->ir_builder_.CreateSIToFP(val, dest_ty);
3162 }
#define LOG(tag)
Definition: Logger.h:188
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::string to_string(char const *&&v)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
+ Here is the call graph for this function:
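For orientation, here is a minimal standalone sketch of the same signed-int-to-FP widening, written against the plain LLVM C++ API; the helper name and builder argument are illustrative and are not OmniSciDB code (and, unlike castToFP(), the sketch does not abort on unsupported widths):

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Value.h>

// Illustrative helper: 32-bit integers become float, 64-bit integers become double,
// non-integer values pass through unchanged, mirroring the switch in castToFP().
llvm::Value* cast_int_to_fp(llvm::IRBuilder<>& builder, llvm::Value* val) {
  if (!val->getType()->isIntegerTy()) {
    return val;  // already a floating-point value
  }
  auto& ctx = builder.getContext();
  const auto width = val->getType()->getIntegerBitWidth();
  llvm::Type* dest_ty =
      width == 32 ? llvm::Type::getFloatTy(ctx) : llvm::Type::getDoubleTy(ctx);
  return builder.CreateSIToFP(val, dest_ty);  // signed int -> floating point
}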

◆ castToIntPtrTyIn()

llvm::Value * Executor::castToIntPtrTyIn ( llvm::Value *  val,
const size_t  bit_width 
)
private

Definition at line 3164 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, cgen_state_, CHECK, CHECK_LT, and get_int_type().

3164  {
3166  CHECK(val->getType()->isPointerTy());
3167 
3168  const auto val_ptr_type = static_cast<llvm::PointerType*>(val->getType());
3169  const auto val_type = val_ptr_type->getElementType();
3170  size_t val_width = 0;
3171  if (val_type->isIntegerTy()) {
3172  val_width = val_type->getIntegerBitWidth();
3173  } else {
3174  if (val_type->isFloatTy()) {
3175  val_width = 32;
3176  } else {
3177  CHECK(val_type->isDoubleTy());
3178  val_width = 64;
3179  }
3180  }
3181  CHECK_LT(size_t(0), val_width);
3182  if (bitWidth == val_width) {
3183  return val;
3184  }
3185  return cgen_state_->ir_builder_.CreateBitCast(
3186  val, llvm::PointerType::get(get_int_type(bitWidth, cgen_state_->context_), 0));
3187 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:
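A correspondingly small sketch of the pointer re-typing step, again using only the LLVM C++ API with illustrative names:

#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/IRBuilder.h>

// Illustrative helper: reinterpret an existing pointer as a pointer to an integer of
// the requested bit width, as castToIntPtrTyIn() does when the widths differ.
llvm::Value* to_int_ptr(llvm::IRBuilder<>& builder, llvm::Value* ptr, unsigned bits) {
  auto* int_ty = llvm::IntegerType::get(builder.getContext(), bits);
  return builder.CreateBitCast(ptr, llvm::PointerType::get(int_ty, /*AddressSpace=*/0));
}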

◆ checkCurrentQuerySession() [1/2]

template<typename SESSION_MAP_LOCK >
bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
SESSION_MAP_LOCK &  read_lock 
)

◆ checkCurrentQuerySession() [2/2]

template<>
bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3560 of file Execute.cpp.

References current_query_session_.

3561  {
3562  // the candidate session matches when it equals the current query session
3563  // (the case where both are the empty session is covered by the equality check)
3564  return (current_query_session_ == candidate_query_session);
3565 }
static std::string current_query_session_
Definition: Execute.h:961
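A hedged usage sketch: the executor pointer, mutex and session id below are placeholders for illustration; callers are expected to hold a shared lock on the session map before calling this member.

// Illustrative only: take a shared (read) lock, then ask whether the given session
// is the one currently executing on this Executor.
mapd_shared_lock<mapd_shared_mutex> session_read_lock(sessions_mutex);  // placeholder mutex
if (executor->checkCurrentQuerySession(candidate_session_id, session_read_lock)) {
  // candidate_session_id owns the currently running query
}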

◆ checkIsQuerySessionInterrupted() [1/2]

template<typename SESSION_MAP_LOCK >
bool Executor::checkIsQuerySessionInterrupted ( const std::string &  query_session,
SESSION_MAP_LOCK &  read_lock 
)

◆ checkIsQuerySessionInterrupted() [2/2]

template<>
bool Executor::checkIsQuerySessionInterrupted ( const std::string &  query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3601 of file Execute.cpp.

References queries_interrupt_flag_.

3603  {
3604  auto flag_it = queries_interrupt_flag_.find(query_session);
3605  return flag_it != queries_interrupt_flag_.end() && flag_it->second;
3606 }
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:963
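A similar hedged sketch for interrupt polling; names are placeholders, and the real call sites live in the kernel execution path:

// Illustrative only: under a shared lock, check the per-session interrupt flag and
// abort the current unit of work if it has been set.
mapd_shared_lock<mapd_shared_mutex> session_read_lock(sessions_mutex);  // placeholder mutex
if (executor->checkIsQuerySessionInterrupted(query_session_id, session_read_lock)) {
  throw std::runtime_error("Query execution has been interrupted");
}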

◆ clearMemory()

void Executor::clearMemory ( const Data_Namespace::MemoryLevel  memory_level)
static

Definition at line 172 of file Execute.cpp.

References Data_Namespace::DataMgr::clearMemory(), Data_Namespace::CPU_LEVEL, execute_mutex_, Catalog_Namespace::SysCatalog::getDataMgr(), Data_Namespace::GPU_LEVEL, Catalog_Namespace::SysCatalog::instance(), and CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCaches().

Referenced by DBHandler::clear_cpu_memory(), DBHandler::clear_gpu_memory(), QueryRunner::QueryRunner::clearCpuMemory(), and QueryRunner::QueryRunner::clearGpuMemory().

172  {
173  switch (memory_level) {
176  mapd_unique_lock<mapd_shared_mutex> flush_lock(
177  execute_mutex_); // Don't flush memory while queries are running
178 
180  if (memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
181  // The hash table cache uses CPU memory not managed by the buffer manager. In the
182  // future, we should manage these allocations with the buffer manager directly.
183  // For now, assume the user wants to purge the hash table cache when they clear
184  // CPU memory (currently used in ExecuteTest to lower memory pressure)
186  }
187  break;
188  }
189  default: {
190  throw std::runtime_error(
191  "Clearing memory levels other than the CPU level or GPU level is not "
192  "supported.");
193  }
194  }
195 }
static mapd_shared_mutex execute_mutex_
Definition: Execute.h:972
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:187
void clearMemory(const MemoryLevel memLevel)
Definition: DataMgr.cpp:377
static void invalidateCaches()
static SysCatalog & instance()
Definition: SysCatalog.h:286
+ Here is the call graph for this function:
+ Here is the caller graph for this function:
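Typical use, as in the DBHandler entry points listed above, is a one-liner; the wrapping function below is an illustrative sketch, not existing code.

#include <Execute.h>

// Flush CPU-level buffers; per the comment in the listing above this also purges the
// hash table cache. Passing GPU_LEVEL clears the GPU buffer pools instead.
void clear_cpu_buffers() {
  Executor::clearMemory(Data_Namespace::MemoryLevel::CPU_LEVEL);
}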

◆ clearMetaInfoCache()

void Executor::clearMetaInfoCache ( )
private

Definition at line 347 of file Execute.cpp.

References agg_col_range_cache_, StringDictionaryGenerations::clear(), TableGenerations::clear(), AggregatedColRange::clear(), InputTableInfoCache::clear(), input_table_info_cache_, string_dictionary_generations_, and table_generations_.

347  {
352 }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:957
StringDictionaryGenerations string_dictionary_generations_
Definition: Execute.h:958
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:956
TableGenerations table_generations_
Definition: Execute.h:959
+ Here is the call graph for this function:

◆ codegenAggregateWindowState()

llvm::Value * Executor::codegenAggregateWindowState ( )
private

Definition at line 326 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), Analyzer::WindowFunction::getKind(), kDECIMAL, kDOUBLE, and kFLOAT.

326  {
328  const auto pi32_type =
329  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
330  const auto pi64_type =
331  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
332  const auto window_func_context =
334  const Analyzer::WindowFunction* window_func = window_func_context->getWindowFunction();
335  const auto window_func_ti = get_adjusted_window_type_info(window_func);
336  const auto aggregate_state_type =
337  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
338  auto aggregate_state = aggregateWindowStatePtr();
339  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
340  const auto aggregate_state_count_i64 = cgen_state_->llInt(
341  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
342  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
343  aggregate_state_count_i64, aggregate_state_type);
344  const auto double_null_lv = cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE));
345  switch (window_func_ti.get_type()) {
346  case kFLOAT: {
347  return cgen_state_->emitCall(
348  "load_avg_float", {aggregate_state, aggregate_state_count, double_null_lv});
349  }
350  case kDOUBLE: {
351  return cgen_state_->emitCall(
352  "load_avg_double", {aggregate_state, aggregate_state_count, double_null_lv});
353  }
354  case kDECIMAL: {
355  return cgen_state_->emitCall(
356  "load_avg_decimal",
357  {aggregate_state,
358  aggregate_state_count,
359  double_null_lv,
360  cgen_state_->llInt<int32_t>(window_func_ti.get_scale())});
361  }
362  default: {
363  return cgen_state_->emitCall(
364  "load_avg_int", {aggregate_state, aggregate_state_count, double_null_lv});
365  }
366  }
367  }
368  if (window_func->getKind() == SqlWindowFunctionKind::COUNT) {
369  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
370  }
371  switch (window_func_ti.get_type()) {
372  case kFLOAT: {
373  return cgen_state_->emitCall("load_float", {aggregate_state});
374  }
375  case kDOUBLE: {
376  return cgen_state_->emitCall("load_double", {aggregate_state});
377  }
378  default: {
379  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
380  }
381  }
382 }
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SqlWindowFunctionKind getKind() const
Definition: Analyzer.h:1447
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenJoinLoops()

void Executor::codegenJoinLoops ( const std::vector< JoinLoop > &  join_loops,
const RelAlgExecutionUnit &  ra_exe_unit,
GroupByAndAggregate &  group_by_and_aggregate,
llvm::Function *  query_func,
llvm::BasicBlock *  entry_bb,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
const ExecutionOptions &  eo 
)
private

Definition at line 653 of file IRCodegen.cpp.

References ExecutionOptions::allow_runtime_query_interrupt, AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, JoinLoop::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::device_type, CgenState::ir_builder_, CgenState::llInt(), CgenState::needs_error_check_, CodeGenerator::posArg(), and ExecutionOptions::with_dynamic_watchdog.

660  {
662  const auto exit_bb =
663  llvm::BasicBlock::Create(cgen_state_->context_, "exit", cgen_state_->current_func_);
664  cgen_state_->ir_builder_.SetInsertPoint(exit_bb);
665  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
666  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
667  CodeGenerator code_generator(this);
668  const auto loops_entry_bb = JoinLoop::codegen(
669  join_loops,
670  [this,
671  query_func,
672  &query_mem_desc,
673  &co,
674  &eo,
675  &group_by_and_aggregate,
676  &join_loops,
677  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
679  addJoinLoopIterator(prev_iters, join_loops.size());
680  auto& builder = cgen_state_->ir_builder_;
681  const auto loop_body_bb = llvm::BasicBlock::Create(
682  builder.getContext(), "loop_body", builder.GetInsertBlock()->getParent());
683  builder.SetInsertPoint(loop_body_bb);
684  const bool can_return_error =
685  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
686  if (can_return_error || cgen_state_->needs_error_check_ ||
688  createErrorCheckControlFlow(query_func,
691  co.device_type);
692  }
693  return loop_body_bb;
694  },
695  code_generator.posArg(nullptr),
696  exit_bb,
697  cgen_state_.get());
698  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
699  cgen_state_->ir_builder_.CreateBr(loops_entry_bb);
700 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
const bool with_dynamic_watchdog
static llvm::BasicBlock * codegen(const std::vector< JoinLoop > &join_loops, const std::function< llvm::BasicBlock *(const std::vector< llvm::Value *> &)> &body_codegen, llvm::Value *outer_iter, llvm::BasicBlock *exit_bb, CgenState *cgen_state)
Definition: JoinLoop.cpp:46
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type)
const bool allow_runtime_query_interrupt
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635
+ Here is the call graph for this function:

◆ codegenSkipDeletedOuterTableRow()

llvm::BasicBlock * Executor::codegenSkipDeletedOuterTableRow ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 2465 of file NativeCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CompilationOptions::filter_on_deleted_column, RelAlgExecutionUnit::input_descs, and TABLE.

2467  {
2469  if (!co.filter_on_deleted_column) {
2470  return nullptr;
2471  }
2472  CHECK(!ra_exe_unit.input_descs.empty());
2473  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2474  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2475  return nullptr;
2476  }
2477  const auto deleted_cd =
2478  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2479  if (!deleted_cd) {
2480  return nullptr;
2481  }
2482  CHECK(deleted_cd->columnType.is_boolean());
2483  const auto deleted_expr =
2484  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2485  outer_input_desc.getTableId(),
2486  deleted_cd->columnId,
2487  outer_input_desc.getNestLevel());
2488  CodeGenerator code_generator(this);
2489  const auto is_deleted =
2490  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2491  const auto is_deleted_bb = llvm::BasicBlock::Create(
2492  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2493  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2494  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2495  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2496  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2497  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2498  cgen_state_->ir_builder_.SetInsertPoint(bb);
2499  return bb;
2500 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:197

◆ codegenWindowAvgEpilogue()

void Executor::codegenWindowAvgEpilogue ( llvm::Value *  crt_val,
llvm::Value *  window_func_null_val,
llvm::Value *  multiplicity_lv 
)
private

Definition at line 289 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

291  {
293  const auto window_func_context =
295  const auto window_func = window_func_context->getWindowFunction();
296  const auto window_func_ti = get_adjusted_window_type_info(window_func);
297  const auto pi32_type =
298  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
299  const auto pi64_type =
300  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
301  const auto aggregate_state_type =
302  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
303  const auto aggregate_state_count_i64 = cgen_state_->llInt(
304  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
305  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
306  aggregate_state_count_i64, aggregate_state_type);
307  std::string agg_count_func_name = "agg_count";
308  switch (window_func_ti.get_type()) {
309  case kFLOAT: {
310  agg_count_func_name += "_float";
311  break;
312  }
313  case kDOUBLE: {
314  agg_count_func_name += "_double";
315  break;
316  }
317  default: {
318  break;
319  }
320  }
321  agg_count_func_name += "_skip_val";
322  cgen_state_->emitCall(agg_count_func_name,
323  {aggregate_state_count, crt_val, window_func_null_val});
324 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenWindowFunction()

llvm::Value * Executor::codegenWindowFunction ( const size_t  target_index,
const CompilationOptions &  co 
)
private

Definition at line 21 of file WindowFunctionIR.cpp.

References WindowProjectNodeContext::activateWindowFunctionContext(), run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, COUNT, CUME_DIST, DENSE_RANK, logger::FATAL, FIRST_VALUE, WindowProjectNodeContext::get(), WindowFunctionContext::getWindowFunction(), LAG, LAST_VALUE, LEAD, LOG, MAX, MIN, NTILE, PERCENT_RANK, RANK, ROW_NUMBER, and SUM.

22  {
24  CodeGenerator code_generator(this);
25  const auto window_func_context =
27  target_index);
28  const auto window_func = window_func_context->getWindowFunction();
29  switch (window_func->getKind()) {
34  return cgen_state_->emitCall("row_number_window_func",
35  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
36  window_func_context->output())),
37  code_generator.posArg(nullptr)});
38  }
41  return cgen_state_->emitCall("percent_window_func",
42  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
43  window_func_context->output())),
44  code_generator.posArg(nullptr)});
45  }
51  const auto& args = window_func->getArgs();
52  CHECK(!args.empty());
53  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
54  CHECK_EQ(arg_lvs.size(), size_t(1));
55  return arg_lvs.front();
56  }
63  }
64  default: {
65  LOG(FATAL) << "Invalid window function kind";
66  }
67  }
68  return nullptr;
69 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
#define LOG(tag)
Definition: Logger.h:188
const WindowFunctionContext * activateWindowFunctionContext(Executor *executor, const size_t target_index) const
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
static const WindowProjectNodeContext * get(Executor *executor)
const Analyzer::WindowFunction * getWindowFunction() const
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenWindowFunctionAggregate(const CompilationOptions &co)
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:

◆ codegenWindowFunctionAggregate()

llvm::Value * Executor::codegenWindowFunctionAggregate ( const CompilationOptions &  co)
private

Definition at line 140 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, CHECK, WindowProjectNodeContext::get(), get_int_type(), and WindowProjectNodeContext::getActiveWindowFunctionContext().

140  {
142  const auto reset_state_false_bb = codegenWindowResetStateControlFlow();
143  auto aggregate_state = aggregateWindowStatePtr();
144  llvm::Value* aggregate_state_count = nullptr;
145  const auto window_func_context =
147  const auto window_func = window_func_context->getWindowFunction();
148  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
149  const auto aggregate_state_count_i64 = cgen_state_->llInt(
150  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
151  const auto pi64_type =
152  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
153  aggregate_state_count =
154  cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_count_i64, pi64_type);
155  }
156  codegenWindowFunctionStateInit(aggregate_state);
157  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
158  const auto count_zero = cgen_state_->llInt(int64_t(0));
159  cgen_state_->emitCall("agg_id", {aggregate_state_count, count_zero});
160  }
161  cgen_state_->ir_builder_.CreateBr(reset_state_false_bb);
162  cgen_state_->ir_builder_.SetInsertPoint(reset_state_false_bb);
164  return codegenWindowFunctionAggregateCalls(aggregate_state, co);
165 }
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
static const WindowProjectNodeContext * get(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
void codegenWindowFunctionStateInit(llvm::Value *aggregate_state)
#define CHECK(condition)
Definition: Logger.h:197
llvm::Value * codegenWindowFunctionAggregateCalls(llvm::Value *aggregate_state, const CompilationOptions &co)
llvm::BasicBlock * codegenWindowResetStateControlFlow()
+ Here is the call graph for this function:

◆ codegenWindowFunctionAggregateCalls()

llvm::Value * Executor::codegenWindowFunctionAggregateCalls ( llvm::Value *  aggregate_state,
const CompilationOptions &  co 
)
private

Definition at line 246 of file WindowFunctionIR.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, CodeGenerator::codegen(), CodeGenerator::codegenCastBetweenIntTypes(), COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), anonymous_namespace{WindowFunctionIR.cpp}::get_window_agg_name(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kFLOAT, and SUM.

247  {
249  const auto window_func_context =
251  const auto window_func = window_func_context->getWindowFunction();
252  const auto window_func_ti = get_adjusted_window_type_info(window_func);
253  const auto window_func_null_val =
254  window_func_ti.is_fp()
255  ? cgen_state_->inlineFpNull(window_func_ti)
256  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
257  const auto& args = window_func->getArgs();
258  llvm::Value* crt_val;
259  if (args.empty()) {
260  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
261  crt_val = cgen_state_->llInt(int64_t(1));
262  } else {
263  CodeGenerator code_generator(this);
264  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
265  CHECK_EQ(arg_lvs.size(), size_t(1));
266  if (window_func->getKind() == SqlWindowFunctionKind::SUM && !window_func_ti.is_fp()) {
267  crt_val = code_generator.codegenCastBetweenIntTypes(
268  arg_lvs.front(), args.front()->get_type_info(), window_func_ti, false);
269  } else {
270  crt_val = window_func_ti.get_type() == kFLOAT
271  ? arg_lvs.front()
272  : cgen_state_->castToTypeIn(arg_lvs.front(), 64);
273  }
274  }
275  const auto agg_name = get_window_agg_name(window_func->getKind(), window_func_ti);
276  llvm::Value* multiplicity_lv = nullptr;
277  if (args.empty()) {
278  cgen_state_->emitCall(agg_name, {aggregate_state, crt_val});
279  } else {
280  cgen_state_->emitCall(agg_name + "_skip_val",
281  {aggregate_state, crt_val, window_func_null_val});
282  }
283  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
284  codegenWindowAvgEpilogue(crt_val, window_func_null_val, multiplicity_lv);
285  }
287 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
std::string get_window_agg_name(const SqlWindowFunctionKind kind, const SQLTypeInfo &window_func_ti)
void codegenWindowAvgEpilogue(llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenAggregateWindowState()
#define CHECK(condition)
Definition: Logger.h:197
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenWindowFunctionStateInit()

void Executor::codegenWindowFunctionStateInit ( llvm::Value *  aggregate_state)
private

Definition at line 196 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

196  {
198  const auto window_func_context =
200  const auto window_func = window_func_context->getWindowFunction();
201  const auto window_func_ti = get_adjusted_window_type_info(window_func);
202  const auto window_func_null_val =
203  window_func_ti.is_fp()
204  ? cgen_state_->inlineFpNull(window_func_ti)
205  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
206  llvm::Value* window_func_init_val;
207  if (window_func_context->getWindowFunction()->getKind() ==
209  switch (window_func_ti.get_type()) {
210  case kFLOAT: {
211  window_func_init_val = cgen_state_->llFp(float(0));
212  break;
213  }
214  case kDOUBLE: {
215  window_func_init_val = cgen_state_->llFp(double(0));
216  break;
217  }
218  default: {
219  window_func_init_val = cgen_state_->llInt(int64_t(0));
220  break;
221  }
222  }
223  } else {
224  window_func_init_val = window_func_null_val;
225  }
226  const auto pi32_type =
227  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
228  switch (window_func_ti.get_type()) {
229  case kDOUBLE: {
230  cgen_state_->emitCall("agg_id_double", {aggregate_state, window_func_init_val});
231  break;
232  }
233  case kFLOAT: {
234  aggregate_state =
235  cgen_state_->ir_builder_.CreateBitCast(aggregate_state, pi32_type);
236  cgen_state_->emitCall("agg_id_float", {aggregate_state, window_func_init_val});
237  break;
238  }
239  default: {
240  cgen_state_->emitCall("agg_id", {aggregate_state, window_func_init_val});
241  break;
242  }
243  }
244 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenWindowResetStateControlFlow()

llvm::BasicBlock * Executor::codegenWindowResetStateControlFlow ( )
private

Definition at line 167 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, WindowProjectNodeContext::getActiveWindowFunctionContext(), and CodeGenerator::toBool().

167  {
169  const auto window_func_context =
171  const auto bitset = cgen_state_->llInt(
172  reinterpret_cast<const int64_t>(window_func_context->partitionStart()));
173  const auto min_val = cgen_state_->llInt(int64_t(0));
174  const auto max_val = cgen_state_->llInt(window_func_context->elementCount() - 1);
175  const auto null_val = cgen_state_->llInt(inline_int_null_value<int64_t>());
176  const auto null_bool_val = cgen_state_->llInt<int8_t>(inline_int_null_value<int8_t>());
177  CodeGenerator code_generator(this);
178  const auto reset_state =
179  code_generator.toBool(cgen_state_->emitCall("bit_is_set",
180  {bitset,
181  code_generator.posArg(nullptr),
182  min_val,
183  max_val,
184  null_val,
185  null_bool_val}));
186  const auto reset_state_true_bb = llvm::BasicBlock::Create(
187  cgen_state_->context_, "reset_state.true", cgen_state_->current_func_);
188  const auto reset_state_false_bb = llvm::BasicBlock::Create(
189  cgen_state_->context_, "reset_state.false", cgen_state_->current_func_);
190  cgen_state_->ir_builder_.CreateCondBr(
191  reset_state, reset_state_true_bb, reset_state_false_bb);
192  cgen_state_->ir_builder_.SetInsertPoint(reset_state_true_bb);
193  return reset_state_false_bb;
194 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
+ Here is the call graph for this function:

◆ collectAllDeviceResults()

ResultSetPtr Executor::collectAllDeviceResults ( SharedKernelContext &  shared_context,
const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner 
)
private

Definition at line 1750 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::build_row_for_empty_input(), catalog_, collectAllDeviceShardedTopResults(), DEBUG_TIMER, SharedKernelContext::getFragmentResults(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, NonGroupedAggregate, reduceMultiDeviceResults(), reduceSpeculativeTopN(), GroupByAndAggregate::shard_count_for_top_groups(), RelAlgExecutionUnit::target_exprs, and use_speculative_top_n().

Referenced by executeWorkUnitImpl().

1755  {
1756  auto timer = DEBUG_TIMER(__func__);
1757  auto& result_per_device = shared_context.getFragmentResults();
1758  if (result_per_device.empty() && query_mem_desc.getQueryDescriptionType() ==
1761  ra_exe_unit.target_exprs, query_mem_desc, device_type);
1762  }
1763  if (use_speculative_top_n(ra_exe_unit, query_mem_desc)) {
1764  try {
1765  return reduceSpeculativeTopN(
1766  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1767  } catch (const std::bad_alloc&) {
1768  throw SpeculativeTopNFailed("Failed during multi-device reduction.");
1769  }
1770  }
1771  const auto shard_count =
1772  device_type == ExecutorDeviceType::GPU
1774  : 0;
1775 
1776  if (shard_count && !result_per_device.empty()) {
1777  return collectAllDeviceShardedTopResults(shared_context, ra_exe_unit);
1778  }
1779  return reduceMultiDeviceResults(
1780  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1781 }
ResultSetPtr build_row_for_empty_input(const std::vector< Analyzer::Expr *> &target_exprs_in, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: Execute.cpp:1713
std::vector< Analyzer::Expr * > target_exprs
ResultSetPtr reduceMultiDeviceResults(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:869
bool use_speculative_top_n(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc)
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
ResultSetPtr reduceSpeculativeTopN(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:969
ResultSetPtr collectAllDeviceShardedTopResults(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
Definition: Execute.cpp:1865
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define DEBUG_TIMER(name)
Definition: Logger.h:313
QueryDescriptionType getQueryDescriptionType() const
static size_t shard_count_for_top_groups(const RelAlgExecutionUnit &ra_exe_unit, const Catalog_Namespace::Catalog &catalog)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ collectAllDeviceShardedTopResults()

ResultSetPtr Executor::collectAllDeviceShardedTopResults ( SharedKernelContext &  shared_context,
const RelAlgExecutionUnit &  ra_exe_unit 
) const
private

Definition at line 1865 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LE, SharedKernelContext::getFragmentResults(), SortInfo::limit, SortInfo::offset, SortInfo::order_entries, anonymous_namespace{Execute.cpp}::permute_storage_columnar(), anonymous_namespace{Execute.cpp}::permute_storage_row_wise(), run_benchmark_import::result, and RelAlgExecutionUnit::sort_info.

Referenced by collectAllDeviceResults().

1867  {
1868  auto& result_per_device = shared_context.getFragmentResults();
1869  const auto first_result_set = result_per_device.front().first;
1870  CHECK(first_result_set);
1871  auto top_query_mem_desc = first_result_set->getQueryMemDesc();
1872  CHECK(!top_query_mem_desc.hasInterleavedBinsOnGpu());
1873  const auto top_n = ra_exe_unit.sort_info.limit + ra_exe_unit.sort_info.offset;
1874  top_query_mem_desc.setEntryCount(0);
1875  for (auto& result : result_per_device) {
1876  const auto result_set = result.first;
1877  CHECK(result_set);
1878  result_set->sort(ra_exe_unit.sort_info.order_entries, top_n);
1879  size_t new_entry_cnt = top_query_mem_desc.getEntryCount() + result_set->rowCount();
1880  top_query_mem_desc.setEntryCount(new_entry_cnt);
1881  }
1882  auto top_result_set = std::make_shared<ResultSet>(first_result_set->getTargetInfos(),
1883  first_result_set->getDeviceType(),
1884  top_query_mem_desc,
1885  first_result_set->getRowSetMemOwner(),
1886  this);
1887  auto top_storage = top_result_set->allocateStorage();
1888  size_t top_output_row_idx{0};
1889  for (auto& result : result_per_device) {
1890  const auto result_set = result.first;
1891  CHECK(result_set);
1892  const auto& top_permutation = result_set->getPermutationBuffer();
1893  CHECK_LE(top_permutation.size(), top_n);
1894  if (top_query_mem_desc.didOutputColumnar()) {
1895  top_output_row_idx = permute_storage_columnar(result_set->getStorage(),
1896  result_set->getQueryMemDesc(),
1897  top_storage,
1898  top_output_row_idx,
1899  top_query_mem_desc,
1900  top_permutation);
1901  } else {
1902  top_output_row_idx = permute_storage_row_wise(result_set->getStorage(),
1903  top_storage,
1904  top_output_row_idx,
1905  top_query_mem_desc,
1906  top_permutation);
1907  }
1908  }
1909  CHECK_EQ(top_output_row_idx, top_query_mem_desc.getEntryCount());
1910  return top_result_set;
1911 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const std::list< Analyzer::OrderEntry > order_entries
size_t permute_storage_row_wise(const ResultSetStorage *input_storage, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1844
const size_t limit
const SortInfo sort_info
#define CHECK_LE(x, y)
Definition: Logger.h:208
size_t permute_storage_columnar(const ResultSetStorage *input_storage, const QueryMemoryDescriptor &input_query_mem_desc, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1794
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define CHECK(condition)
Definition: Logger.h:197
const size_t offset
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ compileBody()

bool Executor::compileBody ( const RelAlgExecutionUnit &  ra_exe_unit,
GroupByAndAggregate &  group_by_and_aggregate,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
const GpuSharedMemoryContext &  gpu_smem_context = {} 
)
private

Definition at line 2502 of file NativeCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CodeGenerator::codegen(), GroupByAndAggregate::codegen(), get_int_type(), RelAlgExecutionUnit::join_quals, CodeGenerator::prioritizeQuals(), to_string(), CodeGenerator::toBool(), and VLOG.

2506  {
2508 
2509  // Switch the code generation into a separate filter function if enabled.
2510  // Note that accesses to function arguments are still codegenned from the
2511  // row function's arguments, then later automatically forwarded and
2512  // remapped into filter function arguments by redeclareFilterFunction().
2513  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2514  llvm::Value* loop_done{nullptr};
2515  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2516  if (cgen_state_->filter_func_) {
2517  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2518  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2519  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2520  row_func_entry_bb->begin());
2521  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2522  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2523  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2524  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2525  }
2526  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2527  cgen_state_->current_func_ = cgen_state_->filter_func_;
2528  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2529  }
2530 
2531  // generate the code for the filter
2532  std::vector<Analyzer::Expr*> primary_quals;
2533  std::vector<Analyzer::Expr*> deferred_quals;
2534  bool short_circuited =
2535  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
2536  if (short_circuited) {
2537  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2538  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2539  << " quals";
2540  }
2541  llvm::Value* filter_lv = cgen_state_->llBool(true);
2542  CodeGenerator code_generator(this);
2543  for (auto expr : primary_quals) {
2544  // Generate the filter for primary quals
2545  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2546  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2547  }
2548  CHECK(filter_lv->getType()->isIntegerTy(1));
2549  llvm::BasicBlock* sc_false{nullptr};
2550  if (!deferred_quals.empty()) {
2551  auto sc_true = llvm::BasicBlock::Create(
2552  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2553  sc_false = llvm::BasicBlock::Create(
2554  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2555  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2556  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2557  if (ra_exe_unit.join_quals.empty()) {
2558  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2559  }
2560  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2561  filter_lv = cgen_state_->llBool(true);
2562  }
2563  for (auto expr : deferred_quals) {
2564  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2565  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2566  }
2567 
2568  CHECK(filter_lv->getType()->isIntegerTy(1));
2569  auto ret = group_by_and_aggregate.codegen(
2570  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2571 
2572  // Switch the code generation back to the row function if a filter
2573  // function was enabled.
2574  if (cgen_state_->filter_func_) {
2575  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2576  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
2577  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2578  }
2579 
2581 
2582  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2583  cgen_state_->current_func_ = cgen_state_->row_func_;
2584  cgen_state_->filter_func_call_ =
2585  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2586 
2587  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2588  auto loop_done_true = llvm::BasicBlock::Create(
2589  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
2590  auto loop_done_false = llvm::BasicBlock::Create(
2591  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
2592  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2593  cgen_state_->ir_builder_.CreateCondBr(
2594  loop_done_flag, loop_done_true, loop_done_false);
2595  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2596  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2597  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2598  } else {
2599  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2600  }
2601  }
2602  return ret;
2603 }
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
const JoinQualsPerNestingLevel join_quals
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr *> &primary_quals, std::vector< Analyzer::Expr *> &deferred_quals)
Definition: LogicalIR.cpp:157
#define CHECK(condition)
Definition: Logger.h:197
void redeclareFilterFunction()
Definition: IRCodegen.cpp:537
#define VLOG(n)
Definition: Logger.h:291
+ Here is the call graph for this function:
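The qual-prioritization logic above amounts to the following scalar semantics; this is a self-contained sketch, not the generated IR, and Row and Qual are stand-ins for the codegenned row and predicates:

#include <functional>
#include <vector>

struct Row {};                                  // stand-in for a fetched input row
using Qual = std::function<bool(const Row&)>;   // stand-in for a codegenned predicate

// Primary (cheap, prioritized) quals are always evaluated; deferred quals only run when
// the primary ones pass, mirroring the sc_true / sc_false blocks emitted by compileBody().
bool row_passes(const Row& row,
                const std::vector<Qual>& primary_quals,
                const std::vector<Qual>& deferred_quals) {
  bool filter = true;
  for (const auto& qual : primary_quals) {
    filter = filter && qual(row);
  }
  if (!filter) {
    return false;  // short-circuit: deferred quals are never evaluated
  }
  for (const auto& qual : deferred_quals) {
    filter = filter && qual(row);
  }
  return filter;
}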

◆ compileWorkUnit()

std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > Executor::compileWorkUnit ( const std::vector< InputTableInfo > &  query_infos,
const PlanState::DeletedColumnsMap &  deleted_cols_map,
const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const CudaMgr_Namespace::CudaMgr *  cuda_mgr,
const bool  allow_lazy_fetch,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
const bool  has_cardinality_estimation,
ColumnCacheMap &  column_cache,
RenderInfo *  render_info = nullptr 
)
private

Definition at line 2076 of file NativeCodegen.cpp.

References ExecutionOptions::allow_multifrag, ExecutionOptions::allow_runtime_query_interrupt, CodeGenerator::alwaysCloneRuntimeFunction(), logger::ASM, AUTOMATIC_IR_METADATA, AUTOMATIC_IR_METADATA_DONE, anonymous_namespace{NativeCodegen.cpp}::bind_pos_placeholders(), anonymous_namespace{NativeCodegen.cpp}::bind_query(), CHECK, CHECK_EQ, GpuSharedMemCodeBuilder::codegen(), CPU, anonymous_namespace{NativeCodegen.cpp}::create_row_function(), logger::DEBUG1, DEBUG_TIMER, CompilationOptions::device_type, RelAlgExecutionUnit::estimator, CompilationOptions::explain_type, g_enable_filter_function, g_gpu_smem_threshold, generate_column_heads_load(), anonymous_namespace{NativeCodegen.cpp}::get_agg_fnames(), get_arg_by_name(), get_gpu_data_layout(), get_gpu_target_triple_string(), get_int_type(), anonymous_namespace{NativeCodegen.cpp}::get_shared_memory_size(), GroupByAndAggregate::getColRangeInfo(), GpuSharedMemCodeBuilder::getInitFunction(), GpuSharedMemCodeBuilder::getReductionFunction(), GPU, RelAlgExecutionUnit::groupby_exprs, GroupByBaselineHash, GroupByPerfectHash, CompilationOptions::hoist_literals, init_agg_val_vec(), GroupByAndAggregate::initQueryMemoryDescriptor(), GpuSharedMemCodeBuilder::injectFunctionsInto(), RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, Invalid, logger::IR, anonymous_namespace{NativeCodegen.cpp}::is_gpu_shared_mem_supported(), is_rt_udf_module_present(), is_udf_module_present(), RenderInfo::isPotentialInSituRender(), GpuSharedMemoryContext::isSharedMemoryUsed(), ExecutionOptions::just_explain, CodeGenerator::link_udf_module(), LOG, mark_function_always_inline(), CodeGenerator::markDeadRuntimeFuncs(), anonymous_namespace{NativeCodegen.cpp}::optimize_ir(), Optimized, ExecutionOptions::output_columnar_hint, logger::PTX, RelAlgExecutionUnit::quals, query_group_by_template(), query_template(), rt_udf_cpu_module, rt_udf_gpu_module, RelAlgExecutionUnit::scan_limit, anonymous_namespace{NativeCodegen.cpp}::serialize_llvm_metadata_footnotes(), serialize_llvm_object(), StdSet, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), to_string(), GpuSharedMemCodeBuilder::toString(), udf_cpu_module, udf_gpu_module, verify_function_ir(), VLOG, and ExecutionOptions::with_dynamic_watchdog.

2088  {
2089  auto timer = DEBUG_TIMER(__func__);
2090 
2091 #ifndef NDEBUG
2092  static std::uint64_t counter = 0;
2093  ++counter;
2094  VLOG(1) << "CODEGEN #" << counter << ":";
2095  LOG(IR) << "CODEGEN #" << counter << ":";
2096  LOG(PTX) << "CODEGEN #" << counter << ":";
2097  LOG(ASM) << "CODEGEN #" << counter << ":";
2098 #endif
2099 
2100  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2101 
2102  GroupByAndAggregate group_by_and_aggregate(
2103  this,
2104  co.device_type,
2105  ra_exe_unit,
2106  query_infos,
2107  row_set_mem_owner,
2108  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2109  : std::nullopt);
2110  auto query_mem_desc =
2111  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2112  max_groups_buffer_entry_guess,
2113  crt_min_byte_width,
2114  render_info,
2116 
2117  if (query_mem_desc->getQueryDescriptionType() ==
2119  !has_cardinality_estimation &&
2120  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2121  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2122  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2123  }
2124 
2125  const bool output_columnar = query_mem_desc->didOutputColumnar();
2126  const bool gpu_shared_mem_optimization =
2127  is_gpu_shared_mem_supported(query_mem_desc.get(),
2128  ra_exe_unit,
2129  cuda_mgr,
2130  co.device_type,
2131  cuda_mgr ? this->blockSize() : 1,
2132  cuda_mgr ? this->numBlocksPerMP() : 1);
2133  if (gpu_shared_mem_optimization) {
2134  // disable interleaved bins optimization on the GPU
2135  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2136  LOG(DEBUG1) << "GPU shared memory is used for the " +
2137  query_mem_desc->queryDescTypeToString() + " query(" +
2138  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2139  query_mem_desc.get())) +
2140  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2141  }
2142 
2143  const GpuSharedMemoryContext gpu_smem_context(
2144  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2145 
2147  const size_t num_count_distinct_descs =
2148  query_mem_desc->getCountDistinctDescriptorsSize();
2149  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2150  const auto& count_distinct_descriptor =
2151  query_mem_desc->getCountDistinctDescriptor(i);
2152  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2153  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2154  !co.hoist_literals)) {
2155  throw QueryMustRunOnCpu();
2156  }
2157  }
2158  }
2159 
2160  // Read the module template and target either CPU or GPU
2161  // by binding the stream position functions to the right implementation:
2162  // stride access for GPU, contiguous for CPU
2163  auto rt_module_copy = llvm::CloneModule(
2164 #if LLVM_VERSION_MAJOR >= 7
2165  *g_rt_module.get(),
2166 #else
2167  g_rt_module.get(),
2168 #endif
2169  cgen_state_->vmap_,
2170  [](const llvm::GlobalValue* gv) {
2171  auto func = llvm::dyn_cast<llvm::Function>(gv);
2172  if (!func) {
2173  return true;
2174  }
2175  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2176  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2178  });
2179 
2181  if (is_udf_module_present(true)) {
2183  }
2184  if (is_rt_udf_module_present(true)) {
2186  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2187  }
2188  } else {
2189  rt_module_copy->setDataLayout(get_gpu_data_layout());
2190  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2191 
2192  if (is_udf_module_present()) {
2193  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
2194 
2195  if (!gpu_triple.isNVPTX()) {
2196  throw QueryMustRunOnCpu();
2197  }
2198 
2200  }
2201  if (is_rt_udf_module_present()) {
2203  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2204  }
2205  }
2206 
2207  cgen_state_->module_ = rt_module_copy.release();
2209 
2210  auto agg_fnames =
2211  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2212 
2213  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2214 
2215  const bool is_group_by{query_mem_desc->isGroupBy()};
2216  auto [query_func, row_func_call] = is_group_by
2218  co.hoist_literals,
2219  *query_mem_desc,
2220  co.device_type,
2221  ra_exe_unit.scan_limit,
2222  gpu_smem_context)
2223  : query_template(cgen_state_->module_,
2224  agg_slot_count,
2225  co.hoist_literals,
2226  !!ra_exe_unit.estimator,
2227  gpu_smem_context);
2228  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2229  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2230  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2231 
2232  cgen_state_->query_func_ = query_func;
2233  cgen_state_->row_func_call_ = row_func_call;
2234  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2235  &query_func->getEntryBlock().front());
2236 
2237  // Generate the function signature and column head fetches s.t.
2238  // double indirection isn't needed in the inner loop
2239  auto& fetch_bb = query_func->front();
2240  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2241  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2242  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2243  query_func->args().begin(),
2244  fetch_ir_builder,
2245  cgen_state_->context_);
2246  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2247 
2248  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2249  is_group_by ? 0 : agg_slot_count,
2250  co.hoist_literals,
2251  cgen_state_->module_,
2252  cgen_state_->context_);
2253  CHECK(cgen_state_->row_func_);
2254  cgen_state_->row_func_bb_ =
2255  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2256 
2258  auto filter_func_ft =
2259  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2260  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2261  llvm::Function::ExternalLinkage,
2262  "filter_func",
2263  cgen_state_->module_);
2264  CHECK(cgen_state_->filter_func_);
2265  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2266  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2267  }
2268 
2269  cgen_state_->current_func_ = cgen_state_->row_func_;
2270  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2271 
2272  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2273  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2274  const auto join_loops =
2275  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2276 
2277  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2278  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2279  if (is_not_deleted_bb) {
2280  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2281  }
2282  if (!join_loops.empty()) {
2283  codegenJoinLoops(join_loops,
2284  body_execution_unit,
2285  group_by_and_aggregate,
2286  query_func,
2287  cgen_state_->row_func_bb_,
2288  *(query_mem_desc.get()),
2289  co,
2290  eo);
2291  } else {
2292  const bool can_return_error = compileBody(
2293  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2294  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2296  createErrorCheckControlFlow(query_func,
2299  co.device_type);
2300  }
2301  }
2302  std::vector<llvm::Value*> hoisted_literals;
2303 
2304  if (co.hoist_literals) {
2305  VLOG(1) << "number of hoisted literals: "
2306  << cgen_state_->query_func_literal_loads_.size()
2307  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2308  << " bytes";
2309  }
2310 
2311  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2312  // we have some hoisted literals...
2313  hoisted_literals = inlineHoistedLiterals();
2314  }
2315 
2316  // replace the row func placeholder call with the call to the actual row func
2317  std::vector<llvm::Value*> row_func_args;
2318  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2319  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2320  }
2321  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2322  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2323  // push hoisted literals arguments, if any
2324  row_func_args.insert(
2325  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2326  llvm::ReplaceInstWithInst(
2327  cgen_state_->row_func_call_,
2328  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2329 
2330  // replace the filter func placeholder call with the call to the actual filter func
2331  if (cgen_state_->filter_func_) {
2332  std::vector<llvm::Value*> filter_func_args;
2333  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2334  arg_it != cgen_state_->filter_func_args_.end();
2335  ++arg_it) {
2336  filter_func_args.push_back(arg_it->first);
2337  }
2338  llvm::ReplaceInstWithInst(
2339  cgen_state_->filter_func_call_,
2340  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2341  }
2342 
2343  // Aggregate
2344  plan_state_->init_agg_vals_ =
2345  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2346 
2347  /*
2348  * If we have decided to use GPU shared memory (decision is not made here), then
2349  * we generate proper code for extra components that it needs (buffer initialization and
2350  * gpu reduction from shared memory to global memory). We then replace these functions
2351  * into the already compiled query_func (replacing two placeholders, write_back_nop and
2352  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
2353  */
2354  if (gpu_smem_context.isSharedMemoryUsed()) {
2355  if (query_mem_desc->getQueryDescriptionType() ==
2357  GpuSharedMemCodeBuilder gpu_smem_code(
2358  cgen_state_->module_,
2359  cgen_state_->context_,
2360  *query_mem_desc,
2361  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
2362  plan_state_->init_agg_vals_);
2363  gpu_smem_code.codegen();
2364  gpu_smem_code.injectFunctionsInto(query_func);
2365 
2366  // helper functions are used for caching purposes later
2367  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2368  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2369  LOG(IR) << gpu_smem_code.toString();
2370  }
2371  }
2372 
2373  auto multifrag_query_func = cgen_state_->module_->getFunction(
2374  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2375  CHECK(multifrag_query_func);
2376 
2377  bind_query(query_func,
2378  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2379  multifrag_query_func,
2380  cgen_state_->module_);
2381 
2382  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2383  if (cgen_state_->filter_func_) {
2384  root_funcs.push_back(cgen_state_->filter_func_);
2385  }
2386  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2387  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2388 
2389  // Always inline the row function and the filter function.
2390  // We don't want register spills in the inner loops.
2391  // LLVM seems to correctly free up alloca instructions
2392  // in these functions even when they are inlined.
2394  if (cgen_state_->filter_func_) {
2396  }
2397 
2398 #ifndef NDEBUG
2399  // Add helpful metadata to the LLVM IR for debugging.
2401 #endif
2402 
2403  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2404  std::string llvm_ir;
2405  if (eo.just_explain) {
2406  if (co.explain_type == ExecutorExplainType::Optimized) {
2407 #ifdef WITH_JIT_DEBUG
2408  throw std::runtime_error(
2409  "Explain optimized not available when JIT runtime debug symbols are enabled");
2410 #else
2411  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2412  // optimized IR after NVVM reflect
2413  llvm::legacy::PassManager pass_manager;
2414  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2415 #endif // WITH_JIT_DEBUG
2416  }
2417  llvm_ir =
2418  serialize_llvm_object(query_func) +
2419  serialize_llvm_object(cgen_state_->row_func_) +
2420  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2421  : "");
2422 
2423 #ifndef NDEBUG
2424  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2425 #endif
2426  }
2427 
2428  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2429  LOG(IR) << "IR for the "
2430  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2431 #ifdef NDEBUG
2432  LOG(IR) << serialize_llvm_object(query_func)
2433  << serialize_llvm_object(cgen_state_->row_func_)
2434  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2435  : "")
2436  << "\nEnd of IR";
2437 #else
2438  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2439 #endif
2440 
2441  // Run some basic validation checks on the LLVM IR before code is generated below.
2442  verify_function_ir(cgen_state_->row_func_);
2443  if (cgen_state_->filter_func_) {
2444  verify_function_ir(cgen_state_->filter_func_);
2445  }
2446 
2447  // Generate final native code from the LLVM IR.
2448  return std::make_tuple(
2449  CompilationResult{
2450  co.device_type == ExecutorDeviceType::CPU
2451  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2452  : optimizeAndCodegenGPU(query_func,
2453  multifrag_query_func,
2454  live_funcs,
2455  is_group_by || ra_exe_unit.estimator,
2456  cuda_mgr,
2457  co),
2458  cgen_state_->getLiterals(),
2459  output_columnar,
2460  llvm_ir,
2461  std::move(gpu_smem_context)},
2462  std::move(query_mem_desc));
2463 }
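The row_func_call_ and filter_func_call_ rewrites above use a standard LLVM pattern: the query template emits a stub call, and once the real function and its full argument list exist, the stub call instruction is swapped for a genuine call with llvm::ReplaceInstWithInst. Below is a minimal, self-contained sketch of that pattern. It is not OmniSciDB code; the names row_process_stub, query_stub_example and row_func_example are invented for illustration, and it only assumes the stock LLVM C++ API.

#include <memory>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>  // llvm::ReplaceInstWithInst

int main() {
  llvm::LLVMContext ctx;
  auto module = std::make_unique<llvm::Module>("sketch", ctx);
  auto* i32 = llvm::Type::getInt32Ty(ctx);

  // Stub the template calls before the real row function exists.
  auto* stub = llvm::Function::Create(llvm::FunctionType::get(i32, false),
                                      llvm::Function::ExternalLinkage,
                                      "row_process_stub",
                                      module.get());

  // Outer "query" function containing the placeholder call.
  auto* query_func = llvm::Function::Create(llvm::FunctionType::get(i32, {i32}, false),
                                            llvm::Function::ExternalLinkage,
                                            "query_stub_example",
                                            module.get());
  llvm::IRBuilder<> ir(llvm::BasicBlock::Create(ctx, "entry", query_func));
  llvm::CallInst* placeholder_call = ir.CreateCall(stub);
  ir.CreateRet(placeholder_call);

  // The real row function, generated later with its final signature.
  auto* row_func = llvm::Function::Create(llvm::FunctionType::get(i32, {i32}, false),
                                          llvm::Function::ExternalLinkage,
                                          "row_func_example",
                                          module.get());

  // Swap the stub call for a call to the real function, forwarding the outer
  // function's argument, mirroring the ReplaceInstWithInst pattern applied to
  // row_func_call_ and filter_func_call_ above.
  llvm::Value* forwarded_arg = &*query_func->arg_begin();
  llvm::ReplaceInstWithInst(placeholder_call,
                            llvm::CallInst::Create(row_func, {forwarded_arg}, ""));

  module->print(llvm::outs(), nullptr);
  return 0;
}

ReplaceInstWithInst rewires every use of the old call to the new one (the ret in the sketch keeps working), which is why the surrounding control flow in query_func needs no further patching.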
+ Here is the call graph for this function:

◆ computeColRangesCache()

AggregatedColRange Executor::computeColRangesCache ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3469 of file Execute.cpp.

References catalog_, CHECK, getLeafColumnRange(), Catalog_Namespace::Catalog::getMetadataForColumn(), getTableInfo(), AggregatedColRange::setColRange(), and ExpressionRange::typeSupportsRange().

Referenced by setupCaching().

3470  {
3471  AggregatedColRange agg_col_range_cache;
3472  CHECK(catalog_);
3473  std::unordered_set<int> phys_table_ids;
3474  for (const auto& phys_input : phys_inputs) {
3475  phys_table_ids.insert(phys_input.table_id);
3476  }
3477  std::vector<InputTableInfo> query_infos;
3478  for (const int table_id : phys_table_ids) {
3479  query_infos.emplace_back(InputTableInfo{table_id, getTableInfo(table_id)});
3480  }
3481  for (const auto& phys_input : phys_inputs) {
3482  const auto cd =
3483  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3484  CHECK(cd);
3485  if (ExpressionRange::typeSupportsRange(cd->columnType)) {
3486  const auto col_var = boost::make_unique<Analyzer::ColumnVar>(
3487  cd->columnType, phys_input.table_id, phys_input.col_id, 0);
3488  const auto col_range = getLeafColumnRange(col_var.get(), query_infos, this, false);
3489  agg_col_range_cache.setColRange(phys_input, col_range);
3490  }
3491  }
3492  return agg_col_range_cache;
3493 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ computeStringDictionaryGenerations()

StringDictionaryGenerations Executor::computeStringDictionaryGenerations ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3495 of file Execute.cpp.

References catalog_, CHECK, Catalog_Namespace::Catalog::getMetadataForColumn(), Catalog_Namespace::Catalog::getMetadataForDict(), kENCODING_DICT, and StringDictionaryGenerations::setGeneration().

Referenced by setupCaching().

3496  {
3497  StringDictionaryGenerations string_dictionary_generations;
3498  CHECK(catalog_);
3499  for (const auto& phys_input : phys_inputs) {
3500  const auto cd =
3501  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3502  CHECK(cd);
3503  const auto& col_ti =
3504  cd->columnType.is_array() ? cd->columnType.get_elem_type() : cd->columnType;
3505  if (col_ti.is_string() && col_ti.get_compression() == kENCODING_DICT) {
3506  const int dict_id = col_ti.get_comp_param();
3507  const auto dd = catalog_->getMetadataForDict(dict_id);
3508  CHECK(dd && dd->stringDict);
3509  string_dictionary_generations.setGeneration(dict_id,
3510  dd->stringDict->storageEntryCount());
3511  }
3512  }
3513  return string_dictionary_generations;
3514 }
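A dictionary's generation, as recorded above, is just its storageEntryCount() at query setup time: string ids created after that point belong to a newer generation and are ignored by this query. The following toy sketch is not OmniSciDB code (ToyStringDictionary and getIdUpToGeneration are invented names); it only illustrates how capping lookups at a recorded generation keeps a query's view of the dictionary stable while strings are appended concurrently.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ToyStringDictionary {
  std::vector<std::string> entries;  // id == index into this vector
  uint64_t storageEntryCount() const { return entries.size(); }
  // Return the id of str if it existed before the given generation, else -1.
  int64_t getIdUpToGeneration(const std::string& str, uint64_t generation) const {
    for (uint64_t id = 0; id < generation && id < entries.size(); ++id) {
      if (entries[id] == str) {
        return static_cast<int64_t>(id);
      }
    }
    return -1;
  }
};

int main() {
  ToyStringDictionary dict{{"foo", "bar"}};
  const uint64_t generation = dict.storageEntryCount();  // snapshot at query start
  dict.entries.push_back("baz");                         // appended mid-query
  std::cout << dict.getIdUpToGeneration("bar", generation) << "\n";  // 1
  std::cout << dict.getIdUpToGeneration("baz", generation) << "\n";  // -1: too new
  return 0;
}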
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ computeTableGenerations()

TableGenerations Executor::computeTableGenerations ( std::unordered_set< int >  phys_table_ids)
private

Definition at line 3516 of file Execute.cpp.

References getTableInfo(), and TableGenerations::setGeneration().

Referenced by setupCaching().

3517  {
3518  TableGenerations table_generations;
3519  for (const int table_id : phys_table_ids) {
3520  const auto table_info = getTableInfo(table_id);
3521  table_generations.setGeneration(
3522  table_id,
3523  TableGeneration{static_cast<int64_t>(table_info.getPhysicalNumTuples()), 0});
3524  }
3525  return table_generations;
3526 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ containsLeftDeepOuterJoin()

bool Executor::containsLeftDeepOuterJoin ( ) const
inline

Definition at line 372 of file Execute.h.

372  {
373  return cgen_state_->contains_left_deep_outer_join_;
374  }

◆ createErrorCheckControlFlow()

void Executor::createErrorCheckControlFlow ( llvm::Function *  query_func,
bool  run_with_dynamic_watchdog,
bool  run_with_allowing_runtime_interrupt,
ExecutorDeviceType  device_type 
)
private

Definition at line 1488 of file NativeCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, ERR_INTERRUPTED, ERR_OUT_OF_TIME, get_arg_by_name(), getExpOfTwo(), GPU, and isPowOfTwo().

1491  {
1492  AUTOMATIC_IR_METADATA(cgen_state_.get());
1493 
1494  // check whether the row processing was successful; currently, it can
1495  // fail by running out of group by buffer slots
1496 
1497  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1498  // when both the dynamic watchdog and runtime interrupt are enabled,
1499  // we use the dynamic watchdog
1500  run_with_allowing_runtime_interrupt = false;
1501  }
1502 
1503  llvm::Value* row_count = nullptr;
1504  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1505  device_type == ExecutorDeviceType::GPU) {
1506  row_count =
1507  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1508  }
1509 
1510  bool done_splitting = false;
1511  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1512  ++bb_it) {
1513  llvm::Value* pos = nullptr;
1514  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1515  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1516  llvm::isa<llvm::PHINode>(*inst_it)) {
1517  if (inst_it->getName() == "pos") {
1518  pos = &*inst_it;
1519  }
1520  continue;
1521  }
1522  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1523  continue;
1524  }
1525  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1526  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1527  auto next_inst_it = inst_it;
1528  ++next_inst_it;
1529  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1530  auto& br_instr = bb_it->back();
1531  llvm::IRBuilder<> ir_builder(&br_instr);
1532  llvm::Value* err_lv = &*inst_it;
1533  if (run_with_dynamic_watchdog) {
1534  CHECK(pos);
1535  llvm::Value* call_watchdog_lv = nullptr;
1536  if (device_type == ExecutorDeviceType::GPU) {
1537  // In order to make sure all threads within a block see the same barrier,
1538  // only those blocks in which none of the threads have experienced the critical
1539  // edge will go through the dynamic watchdog computation
1540  CHECK(row_count);
1541  auto crit_edge_rem =
1542  (blockSize() & (blockSize() - 1))
1543  ? ir_builder.CreateSRem(
1544  row_count,
1545  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1546  : ir_builder.CreateAnd(
1547  row_count,
1548  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1549  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1550  crit_edge_threshold->setName("crit_edge_threshold");
1551 
1552  // only those threads where pos < crit_edge_threshold go through dynamic
1553  // watchdog call
1554  call_watchdog_lv =
1555  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1556  } else {
1557  // CPU path: run watchdog for every 64th row
1558  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1559  call_watchdog_lv = ir_builder.CreateICmp(
1560  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1561  }
1562  CHECK(call_watchdog_lv);
1563  auto error_check_bb = bb_it->splitBasicBlock(
1564  llvm::BasicBlock::iterator(br_instr), ".error_check");
1565  auto& watchdog_br_instr = bb_it->back();
1566 
1567  auto watchdog_check_bb = llvm::BasicBlock::Create(
1568  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1569  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1570  auto detected_timeout = watchdog_ir_builder.CreateCall(
1571  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1572  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1573  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1574  watchdog_ir_builder.CreateBr(error_check_bb);
1575 
1576  llvm::ReplaceInstWithInst(
1577  &watchdog_br_instr,
1578  llvm::BranchInst::Create(
1579  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1580  ir_builder.SetInsertPoint(&br_instr);
1581  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1582 
1583  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1584  unified_err_lv->addIncoming(err_lv, &*bb_it);
1585  err_lv = unified_err_lv;
1586  } else if (run_with_allowing_runtime_interrupt) {
1587  CHECK(pos);
1588  llvm::Value* call_check_interrupt_lv = nullptr;
1589  if (device_type == ExecutorDeviceType::GPU) {
1590  // approximate how many times the %pos variable
1591  // is increased --> the number of iteration
1592  int32_t num_shift_by_gridDim = getExpOfTwo(gridSize());
1593  int32_t num_shift_by_blockDim = getExpOfTwo(blockSize());
1594  if (!isPowOfTwo(gridSize())) {
1595  num_shift_by_gridDim++;
1596  }
1597  if (!isPowOfTwo(blockSize())) {
1598  num_shift_by_blockDim++;
1599  }
1600  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1601  // check the interrupt flag for every 64th iteration
1602  llvm::Value* pos_shifted_per_iteration =
1603  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1604  auto interrupt_predicate =
1605  ir_builder.CreateAnd(pos_shifted_per_iteration, uint64_t(0x3f));
1606  call_check_interrupt_lv =
1607  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1608  interrupt_predicate,
1609  cgen_state_->llInt(int64_t(0LL)));
1610  } else {
1611  // CPU path: run interrupt checker for every 64th row
1612  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1613  call_check_interrupt_lv =
1614  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1615  interrupt_predicate,
1616  cgen_state_->llInt(int64_t(0LL)));
1617  }
1618  CHECK(call_check_interrupt_lv);
1619  auto error_check_bb = bb_it->splitBasicBlock(
1620  llvm::BasicBlock::iterator(br_instr), ".error_check");
1621  auto& check_interrupt_br_instr = bb_it->back();
1622 
1623  auto interrupt_check_bb = llvm::BasicBlock::Create(
1624  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1625  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1626  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1627  cgen_state_->module_->getFunction("check_interrupt"), {});
1628  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1629  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1630  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1631 
1632  llvm::ReplaceInstWithInst(
1633  &check_interrupt_br_instr,
1634  llvm::BranchInst::Create(
1635  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1636  ir_builder.SetInsertPoint(&br_instr);
1637  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1638 
1639  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1640  unified_err_lv->addIncoming(err_lv, &*bb_it);
1641  err_lv = unified_err_lv;
1642  }
1643  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1644  err_lv =
1645  ir_builder.CreateCall(cgen_state_->module_->getFunction("record_error_code"),
1646  std::vector<llvm::Value*>{err_lv, error_code_arg});
1647  if (device_type == ExecutorDeviceType::GPU) {
1648  // let kernel execution finish as expected, regardless of the observed error,
1649  // unless it is from the dynamic watchdog where all threads within that block
1650  // return together.
1651  if (run_with_allowing_runtime_interrupt) {
1652  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1653  err_lv,
1654  cgen_state_->llInt(Executor::ERR_INTERRUPTED));
1655  } else {
1656  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1657  err_lv,
1658  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1659  }
1660 
1661  } else {
1662  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1663  err_lv,
1664  cgen_state_->llInt(static_cast<int32_t>(0)));
1665  }
1666  auto error_bb = llvm::BasicBlock::Create(
1667  cgen_state_->context_, ".error_exit", query_func, new_bb);
1668  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1669  llvm::ReplaceInstWithInst(&br_instr,
1670  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1671  done_splitting = true;
1672  break;
1673  }
1674  }
1675  }
1676  CHECK(done_splitting);
1677 }
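On the GPU path above, each thread's pos advances by gridSize() * blockSize() every iteration of the row loop, so shifting pos right by roughly log2(gridSize()) + log2(blockSize()) recovers the iteration number, and masking with 0x3f fires the interrupt poll about once every 64 iterations. Below is a standalone sketch of that arithmetic under an assumed 64 x 1024 launch configuration; getExpOfTwo and isPowOfTwo are local reimplementations of the MathUtils.h helpers referenced in the listing, not the originals.

#include <cstdint>
#include <iostream>

unsigned getExpOfTwo(unsigned n) {  // floor(log2(n))
  unsigned exp = 0;
  while (n >>= 1) {
    ++exp;
  }
  return exp;
}

bool isPowOfTwo(unsigned n) {
  return n > 0 && (n & (n - 1)) == 0;
}

int main() {
  const unsigned grid_size = 64;     // assumed power-of-two launch configuration
  const unsigned block_size = 1024;  // assumed
  int32_t shift = getExpOfTwo(grid_size) + getExpOfTwo(block_size);
  if (!isPowOfTwo(grid_size)) {
    ++shift;  // round up for non-power-of-two sizes
  }
  if (!isPowOfTwo(block_size)) {
    ++shift;
  }
  // Each thread's pos advances by grid_size * block_size (65536 here) per iteration,
  // so pos >> shift recovers the iteration number; masking with 0x3f makes the
  // interrupt poll fire roughly once every 64 iterations.
  for (uint64_t iteration : {1ULL, 63ULL, 64ULL, 128ULL}) {
    const uint64_t pos = iteration * grid_size * block_size;
    std::cout << "iteration " << iteration << " -> poll? "
              << (((pos >> shift) & 0x3f) == 0) << "\n";  // 0, 0, 1, 1
  }
  return 0;
}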
+ Here is the call graph for this function:

◆ createKernels()

std::vector< std::unique_ptr< ExecutionKernel > > Executor::createKernels ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
ColumnFetcher column_fetcher,
const std::vector< InputTableInfo > &  table_infos,
const ExecutionOptions eo,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const size_t  context_count,
const QueryCompilationDescriptor query_comp_desc,
const QueryMemoryDescriptor query_mem_desc,
RenderInfo render_info,
std::unordered_set< int > &  available_gpus,
int &  available_cpus 
)
private

Determines execution dispatch mode and required fragments for a given query step, then creates kernels to execute the query and returns them for launch.

Definition at line 1939 of file Execute.cpp.

References ExecutionOptions::allow_multifrag, catalog_, CHECK, CHECK_GE, CHECK_GT, anonymous_namespace{Execute.cpp}::checkWorkUnitWatchdog(), deviceCount(), g_inner_join_fragment_skipping, getColLazyFetchInfo(), Catalog_Namespace::Catalog::getDataMgr(), QueryCompilationDescriptor::getDeviceType(), QueryMemoryDescriptor::getEntryCount(), SharedKernelContext::getFragOffsets(), Data_Namespace::DataMgr::getMemoryInfo(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, ExecutionOptions::gpu_input_mem_limit_percent, Data_Namespace::GPU_LEVEL, anonymous_namespace{Execute.cpp}::has_lazy_fetched_columns(), logger::INFO, RelAlgExecutionUnit::input_descs, KernelPerFragment, LOG, MultifragmentKernel, ExecutionOptions::outer_fragment_indices, plan_state_, Projection, RelAlgExecutionUnit::target_exprs, QueryMemoryDescriptor::toString(), RelAlgExecutionUnit::use_bump_allocator, VLOG, and ExecutionOptions::with_watchdog.

Referenced by executeWorkUnitImpl().

1952  {
1953  std::vector<std::unique_ptr<ExecutionKernel>> execution_kernels;
1954 
1955  QueryFragmentDescriptor fragment_descriptor(
1956  ra_exe_unit,
1957  table_infos,
1958  query_comp_desc.getDeviceType() == ExecutorDeviceType::GPU
1959  ? catalog_->getDataMgr().getMemoryInfo(Data_Namespace::MemoryLevel::GPU_LEVEL)
1960  : std::vector<Data_Namespace::MemoryInfo>{},
1961  eo.gpu_input_mem_limit_percent,
1962  eo.outer_fragment_indices);
1963  CHECK(!ra_exe_unit.input_descs.empty());
1964 
1965  const auto device_type = query_comp_desc.getDeviceType();
1966  const bool uses_lazy_fetch =
1967  plan_state_->allow_lazy_fetch_ &&
1968  has_lazy_fetched_columns(getColLazyFetchInfo(ra_exe_unit.target_exprs));
1969  const bool use_multifrag_kernel = (device_type == ExecutorDeviceType::GPU) &&
1970  eo.allow_multifrag && (!uses_lazy_fetch || is_agg);
1971  const auto device_count = deviceCount(device_type);
1972  CHECK_GT(device_count, 0);
1973 
1974  fragment_descriptor.buildFragmentKernelMap(ra_exe_unit,
1975  shared_context.getFragOffsets(),
1976  device_count,
1977  device_type,
1978  use_multifrag_kernel,
1979  g_inner_join_fragment_skipping,
1980  this);
1981  if (eo.with_watchdog && fragment_descriptor.shouldCheckWorkUnitWatchdog()) {
1982  checkWorkUnitWatchdog(ra_exe_unit, table_infos, *catalog_, device_type, device_count);
1983  }
1984 
1985  if (use_multifrag_kernel) {
1986  VLOG(1) << "Creating multifrag execution kernels";
1987  VLOG(1) << query_mem_desc.toString();
1988 
1989  // NB: We should never be on this path when the query is retried because of running
1990  // out of group by slots; also, for scan only queries on CPU we want the
1991  // high-granularity, fragment by fragment execution instead. For scan only queries on
1992  // GPU, we want the multifrag kernel path to save the overhead of allocating an output
1993  // buffer per fragment.
1994  auto multifrag_kernel_dispatch = [&ra_exe_unit,
1995  &execution_kernels,
1996  &column_fetcher,
1997  &eo,
1998  &query_comp_desc,
1999  &query_mem_desc,
2000  render_info](const int device_id,
2001  const FragmentsList& frag_list,
2002  const int64_t rowid_lookup_key) {
2003  execution_kernels.emplace_back(
2004  std::make_unique<ExecutionKernel>(ra_exe_unit,
2005  ExecutorDeviceType::GPU,
2006  device_id,
2007  eo,
2008  column_fetcher,
2009  query_comp_desc,
2010  query_mem_desc,
2011  frag_list,
2012  ExecutorDispatchMode::MultifragmentKernel,
2013  render_info,
2014  rowid_lookup_key));
2015  };
2016  fragment_descriptor.assignFragsToMultiDispatch(multifrag_kernel_dispatch);
2017  } else {
2018  VLOG(1) << "Creating one execution kernel per fragment";
2019  VLOG(1) << query_mem_desc.toString();
2020 
2021  if (!ra_exe_unit.use_bump_allocator && allow_single_frag_table_opt &&
2022  (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) &&
2023  table_infos.size() == 1 && table_infos.front().table_id > 0) {
2024  const auto max_frag_size =
2025  table_infos.front().info.getFragmentNumTuplesUpperBound();
2026  if (max_frag_size < query_mem_desc.getEntryCount()) {
2027  LOG(INFO) << "Lowering scan limit from " << query_mem_desc.getEntryCount()
2028  << " to match max fragment size " << max_frag_size
2029  << " for kernel per fragment execution path.";
2030  throw CompilationRetryNewScanLimit(max_frag_size);
2031  }
2032  }
2033 
2034  size_t frag_list_idx{0};
2035  auto fragment_per_kernel_dispatch = [&ra_exe_unit,
2036  &execution_kernels,
2037  &column_fetcher,
2038  &eo,
2039  &frag_list_idx,
2040  &device_type,
2041  &query_comp_desc,
2042  &query_mem_desc,
2043  render_info](const int device_id,
2044  const FragmentsList& frag_list,
2045  const int64_t rowid_lookup_key) {
2046  if (!frag_list.size()) {
2047  return;
2048  }
2049  CHECK_GE(device_id, 0);
2050 
2051  execution_kernels.emplace_back(
2052  std::make_unique<ExecutionKernel>(ra_exe_unit,
2053  device_type,
2054  device_id,
2055  eo,
2056  column_fetcher,
2057  query_comp_desc,
2058  query_mem_desc,
2059  frag_list,
2060  ExecutorDispatchMode::KernelPerFragment,
2061  render_info,
2062  rowid_lookup_key));
2063  ++frag_list_idx;
2064  };
2065 
2066  fragment_descriptor.assignFragsToKernelDispatch(fragment_per_kernel_dispatch,
2067  ra_exe_unit);
2068  }
2069 
2070  return execution_kernels;
2071 }
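Reduced to its essentials, the dispatch decision above is: fuse fragments into multifragment kernels only on GPU with allow_multifrag set, and only if no lazily fetched columns are involved or the step is an aggregation; otherwise build one kernel per fragment. The sketch below is not OmniSciDB code; chooseDispatchMode and its bool parameters are invented stand-ins for the compilation descriptor, plan state and execution options consulted above.

#include <iostream>

enum class DispatchMode { MultifragmentKernel, KernelPerFragment };

DispatchMode chooseDispatchMode(bool on_gpu,
                                bool allow_multifrag,
                                bool uses_lazy_fetch,
                                bool is_agg) {
  // GPU queries may fuse many fragments into one kernel launch, unless lazily
  // fetched columns force fragment-granular execution for a non-aggregate step.
  const bool use_multifrag_kernel =
      on_gpu && allow_multifrag && (!uses_lazy_fetch || is_agg);
  return use_multifrag_kernel ? DispatchMode::MultifragmentKernel
                              : DispatchMode::KernelPerFragment;
}

int main() {
  // A GPU aggregate with lazy fetch still takes the multifragment path...
  std::cout << (chooseDispatchMode(true, true, true, true) ==
                DispatchMode::MultifragmentKernel)
            << "\n";  // 1
  // ...while a CPU query always gets one kernel per fragment.
  std::cout << (chooseDispatchMode(false, true, false, false) ==
                DispatchMode::KernelPerFragment)
            << "\n";  // 1
  return 0;
}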
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ deviceCount()

int Executor::deviceCount ( const ExecutorDeviceType  device_type) const
private

Definition at line 616 of file Execute.cpp.

References catalog_, CHECK, Data_Namespace::DataMgr::getCudaMgr(), Catalog_Namespace::Catalog::getDataMgr(), and GPU.

Referenced by createKernels(), and deviceCountForMemoryLevel().

616  {
617  if (device_type == ExecutorDeviceType::GPU) {
618  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
619  CHECK(cuda_mgr);
620  return cuda_mgr->getDeviceCount();
621  } else {
622  return 1;
623  }
624 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ deviceCountForMemoryLevel()

int Executor::deviceCountForMemoryLevel ( const Data_Namespace::MemoryLevel  memory_level) const
private

Definition at line 626 of file Execute.cpp.

References CPU, deviceCount(), GPU, and Data_Namespace::GPU_LEVEL.

Referenced by buildHashTableForQualifier().

627  {
628  return memory_level == GPU_LEVEL ? deviceCount(ExecutorDeviceType::GPU)
629  : deviceCount(ExecutorDeviceType::CPU);
630 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ deviceCycles()

int64_t Executor::deviceCycles ( int  milliseconds) const
private

Definition at line 3135 of file Execute.cpp.

References catalog_, CHECK, Data_Namespace::DataMgr::getCudaMgr(), and Catalog_Namespace::Catalog::getDataMgr().

3135  {
3136  CHECK(catalog_);
3137  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
3138  CHECK(cuda_mgr);
3139  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3140  return static_cast<int64_t>(dev_props.front().clockKhz) * milliseconds;
3141 }
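For example, assuming a device that reports clockKhz = 1,410,000 (a 1.41 GHz clock), deviceCycles(100) returns 1,410,000 * 100 = 141,000,000, i.e. roughly the number of clock ticks that device performs in 100 milliseconds.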
+ Here is the call graph for this function:

◆ enableRuntimeQueryInterrupt()

void Executor::enableRuntimeQueryInterrupt ( const unsigned  interrupt_freq) const

Definition at line 3608 of file Execute.cpp.

References g_enable_runtime_query_interrupt, and g_runtime_query_interrupt_frequency.

3608  {
3609  // The only scenario in which we intentionally call this function is
3610  // to allow runtime query interrupt in QueryRunner for test cases.
3611  // Because the test machine's default settings do not allow runtime query interrupt,
3612  // we have to turn it on within test code when necessary.
3613  g_enable_runtime_query_interrupt = true;
3614  g_runtime_query_interrupt_frequency = interrupt_freq;
3615 }

◆ executeExplain()

ResultSetPtr Executor::executeExplain ( const QueryCompilationDescriptor query_comp_desc)
private

Definition at line 1619 of file Execute.cpp.

References QueryCompilationDescriptor::getIR().

Referenced by executeWorkUnitImpl().

1619  {
1620  return std::make_shared<ResultSet>(query_comp_desc.getIR());
1621 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ executePlanWithGroupBy()

int32_t Executor::executePlanWithGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t *>> &  col_buffers,
const std::vector< size_t >  outer_tab_frag_ids,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const int  outer_table_id,
const int64_t  limit,
const uint32_t  start_rowid,
const uint32_t  num_tables,
RenderInfo render_info 
)
private

Definition at line 2848 of file Execute.cpp.

References blockSize(), CHECK, CHECK_NE, anonymous_namespace{Execute.cpp}::check_rows_less_than_needed(), CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_GPU_MEM, ERR_OUT_OF_RENDER_MEM, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY, error_code, logger::FATAL, g_enable_dynamic_watchdog, g_enable_runtime_query_interrupt, CompilationResult::generated_code, QueryMemoryDescriptor::getEntryCount(), getJoinHashTablePtrs(), QueryExecutionContext::getRowSet(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, gridSize(), RelAlgExecutionUnit::groupby_exprs, INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, interrupted_, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, num_rows, shared::printContainer(), QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, RenderInfo::render_allocator_map_ptr, RelAlgExecutionUnit::scan_limit, serializeLiterals(), QueryMemoryDescriptor::setEntryCount(), RelAlgExecutionUnit::union_all, RenderInfo::useCudaBuffers(), and VLOG.

2865  {
2866  auto timer = DEBUG_TIMER(__func__);
2867  INJECT_TIMER(executePlanWithGroupBy);
2868  CHECK(!results);
2869  if (col_buffers.empty()) {
2870  return 0;
2871  }
2872  CHECK_NE(ra_exe_unit.groupby_exprs.size(), size_t(0));
2873  // TODO(alex):
2874  // 1. Optimize size (make keys more compact).
2875  // 2. Resize on overflow.
2876  // 3. Optimize runtime.
2877  auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
2878  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
2879  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
2880  if ((g_enable_dynamic_watchdog || g_enable_runtime_query_interrupt) &&
2881  interrupted_.load()) {
2882  return ERR_INTERRUPTED;
2883  }
2884 
2885  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
2886  if (render_info && render_info->useCudaBuffers()) {
2887  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
2888  }
2889 
2890  VLOG(2) << "bool(ra_exe_unit.union_all)=" << bool(ra_exe_unit.union_all)
2891  << " ra_exe_unit.input_descs="
2892  << shared::printContainer(ra_exe_unit.input_descs)
2893  << " ra_exe_unit.input_col_descs="
2894  << shared::printContainer(ra_exe_unit.input_col_descs)
2895  << " ra_exe_unit.scan_limit=" << ra_exe_unit.scan_limit
2896  << " num_rows=" << shared::printContainer(num_rows)
2897  << " frag_offsets=" << shared::printContainer(frag_offsets)
2898  << " query_exe_context->query_buffers_->num_rows_="
2899  << query_exe_context->query_buffers_->num_rows_
2900  << " query_exe_context->query_mem_desc_.getEntryCount()="
2901  << query_exe_context->query_mem_desc_.getEntryCount()
2902  << " device_id=" << device_id << " outer_table_id=" << outer_table_id
2903  << " scan_limit=" << scan_limit << " start_rowid=" << start_rowid
2904  << " num_tables=" << num_tables;
2905 
2906  RelAlgExecutionUnit ra_exe_unit_copy = ra_exe_unit;
2907  // For UNION ALL, filter out input_descs and input_col_descs that are not associated
2908  // with outer_table_id.
2909  if (ra_exe_unit_copy.union_all) {
2910  // Sort outer_table_id first, then pop the rest off of ra_exe_unit_copy.input_descs.
2911  std::stable_sort(ra_exe_unit_copy.input_descs.begin(),
2912  ra_exe_unit_copy.input_descs.end(),
2913  [outer_table_id](auto const& a, auto const& b) {
2914  return a.getTableId() == outer_table_id &&
2915  b.getTableId() != outer_table_id;
2916  });
2917  while (!ra_exe_unit_copy.input_descs.empty() &&
2918  ra_exe_unit_copy.input_descs.back().getTableId() != outer_table_id) {
2919  ra_exe_unit_copy.input_descs.pop_back();
2920  }
2921  // Filter ra_exe_unit_copy.input_col_descs.
2922  ra_exe_unit_copy.input_col_descs.remove_if(
2923  [outer_table_id](auto const& input_col_desc) {
2924  return input_col_desc->getScanDesc().getTableId() != outer_table_id;
2925  });
2926  query_exe_context->query_mem_desc_.setEntryCount(ra_exe_unit_copy.scan_limit);
2927  }
2928 
2929  if (device_type == ExecutorDeviceType::CPU) {
2930  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
2931  compilation_result.generated_code);
2932  CHECK(cpu_generated_code);
2933  query_exe_context->launchCpuCode(
2934  ra_exe_unit_copy,
2935  cpu_generated_code.get(),
2936  hoist_literals,
2937  hoist_buf,
2938  col_buffers,
2939  num_rows,
2940  frag_offsets,
2941  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
2942  &error_code,
2943  num_tables,
2944  join_hash_table_ptrs);
2945  } else {
2946  try {
2947  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
2948  compilation_result.generated_code);
2949  CHECK(gpu_generated_code);
2950  query_exe_context->launchGpuCode(
2951  ra_exe_unit_copy,
2952  gpu_generated_code.get(),
2953  hoist_literals,
2954  hoist_buf,
2955  col_buffers,
2956  num_rows,
2957  frag_offsets,
2958  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
2959  data_mgr,
2960  blockSize(),
2961  gridSize(),
2962  device_id,
2963  compilation_result.gpu_smem_context.getSharedMemorySize(),
2964  &error_code,
2965  num_tables,
2966  join_hash_table_ptrs,
2967  render_allocator_map_ptr);
2968  } catch (const OutOfMemory&) {
2969  return ERR_OUT_OF_GPU_MEM;
2970  } catch (const OutOfRenderMemory&) {
2971  return ERR_OUT_OF_RENDER_MEM;
2972  } catch (const StreamingTopNNotSupportedInRenderQuery&) {
2974  } catch (const std::exception& e) {
2975  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
2976  }
2977  }
2978 
2979  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
2980  error_code == Executor::ERR_DIV_BY_ZERO ||
2981  error_code == Executor::ERR_OUT_OF_TIME ||
2982  error_code == Executor::ERR_INTERRUPTED ||
2983  error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES ||
2984  error_code == Executor::ERR_GEOS) {
2985  return error_code;
2986  }
2987 
2988  if (error_code != Executor::ERR_OVERFLOW_OR_UNDERFLOW &&
2989  error_code != Executor::ERR_DIV_BY_ZERO && !render_allocator_map_ptr) {
2990  results = query_exe_context->getRowSet(ra_exe_unit_copy,
2991  query_exe_context->query_mem_desc_);
2992  CHECK(results);
2993  VLOG(2) << "results->rowCount()=" << results->rowCount();
2994  results->holdLiterals(hoist_buf);
2995  }
2996  if (error_code < 0 && render_allocator_map_ptr) {
2997  auto const adjusted_scan_limit =
2998  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
2999  // More rows passed the filter than available slots. We don't have a count to check,
3000  // so assume we met the limit if a scan limit is set
3001  if (adjusted_scan_limit != 0) {
3002  return 0;
3003  } else {
3004  return error_code;
3005  }
3006  }
3007  if (error_code && (!scan_limit || check_rows_less_than_needed(results, scan_limit))) {
3008  return error_code; // unlucky, not enough results and we ran out of slots
3009  }
3010 
3011  return 0;
3012 }
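For UNION ALL, the stable_sort / pop_back / remove_if sequence above restricts the execution unit's inputs to the current outer table. The sketch below applies the same sequence to plain table ids; it is not OmniSciDB code, and the ids 3, 5 and 7 are arbitrary illustration values.

#include <algorithm>
#include <iostream>
#include <list>
#include <vector>

int main() {
  const int outer_table_id = 7;
  std::vector<int> input_desc_table_ids{3, 7, 5, 7};
  std::list<int> input_col_desc_table_ids{3, 3, 7, 5, 7};

  // Move descriptors for the outer table to the front, keeping relative order.
  std::stable_sort(input_desc_table_ids.begin(),
                   input_desc_table_ids.end(),
                   [outer_table_id](int a, int b) {
                     return a == outer_table_id && b != outer_table_id;
                   });
  // Drop everything that is not the outer table from the back.
  while (!input_desc_table_ids.empty() &&
         input_desc_table_ids.back() != outer_table_id) {
    input_desc_table_ids.pop_back();
  }
  // Filter the column descriptors the same way.
  input_col_desc_table_ids.remove_if(
      [outer_table_id](int id) { return id != outer_table_id; });

  for (int id : input_desc_table_ids) {
    std::cout << id << ' ';  // 7 7
  }
  std::cout << '\n';
  for (int id : input_col_desc_table_ids) {
    std::cout << id << ' ';  // 7 7
  }
  std::cout << '\n';
  return 0;
}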
+ Here is the call graph for this function:

◆ executePlanWithoutGroupBy()

int32_t Executor::executePlanWithoutGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const std::vector< Analyzer::Expr *> &  target_exprs,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t *>> &  col_buffers,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const uint32_t  start_rowid,
const uint32_t  num_tables,
RenderInfo render_info 
)
private

Definition at line 2648 of file Execute.cpp.

References blockSize(), CHECK, CHECK_EQ, CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_GPU_MEM, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, error_code, RelAlgExecutionUnit::estimator, QueryExecutionContext::estimator_result_set_, logger::FATAL, g_bigint_count, g_enable_dynamic_watchdog, g_enable_runtime_query_interrupt, CompilationResult::generated_code, get_target_info(), QueryExecutionContext::getAggInitValForIndex(), getJoinHashTablePtrs(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, gridSize(), INJECT_TIMER, interrupted_, is_distinct_target(), RenderInfo::isPotentialInSituRender(), GpuSharedMemoryContext::isSharedMemoryUsed(), kAPPROX_COUNT_DISTINCT, kAVG, kCOUNT, kSAMPLE, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, num_rows, out, QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, reduceResults(), RenderInfo::render_allocator_map_ptr, resetInterrupt(), serializeLiterals(), takes_float_argument(), and RenderInfo::useCudaBuffers().

2663  {
2664  INJECT_TIMER(executePlanWithoutGroupBy);
2665  auto timer = DEBUG_TIMER(__func__);
2666  CHECK(!results);
2667  if (col_buffers.empty()) {
2668  return 0;
2669  }
2670 
2671  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
2672  if (render_info) {
2673  // TODO(adb): make sure that we either never get here in the CPU case, or if we do get
2674  // here, we are in non-insitu mode.
2675  CHECK(render_info->useCudaBuffers() || !render_info->isPotentialInSituRender())
2676  << "CUDA disabled rendering in the executePlanWithoutGroupBy query path is "
2677  "currently unsupported.";
2678  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
2679  }
2680 
2681  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
2682  std::vector<int64_t*> out_vec;
2683  const auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
2684  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
2685  std::unique_ptr<OutVecOwner> output_memory_scope;
2686  if ((g_enable_dynamic_watchdog || g_enable_runtime_query_interrupt) &&
2687  interrupted_.load()) {
2688  resetInterrupt();
2689  return ERR_INTERRUPTED;
2690  }
2691  if (device_type == ExecutorDeviceType::CPU) {
2692  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
2693  compilation_result.generated_code);
2694  CHECK(cpu_generated_code);
2695  out_vec = query_exe_context->launchCpuCode(ra_exe_unit,
2696  cpu_generated_code.get(),
2697  hoist_literals,
2698  hoist_buf,
2699  col_buffers,
2700  num_rows,
2701  frag_offsets,
2702  0,
2703  &error_code,
2704  num_tables,
2705  join_hash_table_ptrs);
2706  output_memory_scope.reset(new OutVecOwner(out_vec));
2707  } else {
2708  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
2709  compilation_result.generated_code);
2710  CHECK(gpu_generated_code);
2711  try {
2712  out_vec = query_exe_context->launchGpuCode(
2713  ra_exe_unit,
2714  gpu_generated_code.get(),
2715  hoist_literals,
2716  hoist_buf,
2717  col_buffers,
2718  num_rows,
2719  frag_offsets,
2720  0,
2721  data_mgr,
2722  blockSize(),
2723  gridSize(),
2724  device_id,
2725  compilation_result.gpu_smem_context.getSharedMemorySize(),
2726  &error_code,
2727  num_tables,
2728  join_hash_table_ptrs,
2729  render_allocator_map_ptr);
2730  output_memory_scope.reset(new OutVecOwner(out_vec));
2731  } catch (const OutOfMemory&) {
2732  return ERR_OUT_OF_GPU_MEM;
2733  } catch (const std::exception& e) {
2734  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
2735  }
2736  }
2737  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
2738  error_code == Executor::ERR_DIV_BY_ZERO ||
2739  error_code == Executor::ERR_OUT_OF_TIME ||
2740  error_code == Executor::ERR_INTERRUPTED ||
2741  error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES ||
2742  error_code == Executor::ERR_GEOS) {
2743  return error_code;
2744  }
2745  if (ra_exe_unit.estimator) {
2746  CHECK(!error_code);
2747  results =
2748  std::shared_ptr<ResultSet>(query_exe_context->estimator_result_set_.release());
2749  return 0;
2750  }
2751  std::vector<int64_t> reduced_outs;
2752  const auto num_frags = col_buffers.size();
2753  const size_t entry_count =
2754  device_type == ExecutorDeviceType::GPU
2755  ? (compilation_result.gpu_smem_context.isSharedMemoryUsed()
2756  ? 1
2757  : blockSize() * gridSize() * num_frags)
2758  : num_frags;
2759  if (size_t(1) == entry_count) {
2760  for (auto out : out_vec) {
2761  CHECK(out);
2762  reduced_outs.push_back(*out);
2763  }
2764  } else {
2765  size_t out_vec_idx = 0;
2766 
2767  for (const auto target_expr : target_exprs) {
2768  const auto agg_info = get_target_info(target_expr, g_bigint_count);
2769  CHECK(agg_info.is_agg);
2770 
2771  const int num_iterations = agg_info.sql_type.is_geometry()
2772  ? agg_info.sql_type.get_physical_coord_cols()
2773  : 1;
2774 
2775  for (int i = 0; i < num_iterations; i++) {
2776  int64_t val1;
2777  const bool float_argument_input = takes_float_argument(agg_info);
2778  if (is_distinct_target(agg_info)) {
2779  CHECK(agg_info.agg_kind == kCOUNT ||
2780  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
2781  val1 = out_vec[out_vec_idx][0];
2782  error_code = 0;
2783  } else {
2784  const auto chosen_bytes = static_cast<size_t>(
2785  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx));
2786  std::tie(val1, error_code) = Executor::reduceResults(
2787  agg_info.agg_kind,
2788  agg_info.sql_type,
2789  query_exe_context->getAggInitValForIndex(out_vec_idx),
2790  float_argument_input ? sizeof(int32_t) : chosen_bytes,
2791  out_vec[out_vec_idx],
2792  entry_count,
2793  false,
2794  float_argument_input);
2795  }
2796  if (error_code) {
2797  break;
2798  }
2799  reduced_outs.push_back(val1);
2800  if (agg_info.agg_kind == kAVG ||
2801  (agg_info.agg_kind == kSAMPLE &&
2802  (agg_info.sql_type.is_varlen() || agg_info.sql_type.is_geometry()))) {
2803  const auto chosen_bytes = static_cast<size_t>(
2804  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx +
2805  1));
2806  int64_t val2;
2807  std::tie(val2, error_code) = Executor::reduceResults(
2808  agg_info.agg_kind == kAVG ? kCOUNT : agg_info.agg_kind,
2809  agg_info.sql_type,
2810  query_exe_context->getAggInitValForIndex(out_vec_idx + 1),
2811  float_argument_input ? sizeof(int32_t) : chosen_bytes,
2812  out_vec[out_vec_idx + 1],
2813  entry_count,
2814  false,
2815  false);
2816  if (error_code) {
2817  break;
2818  }
2819  reduced_outs.push_back(val2);
2820  ++out_vec_idx;
2821  }
2822  ++out_vec_idx;
2823  }
2824  }
2825  }
2826 
2827  if (error_code) {
2828  return error_code;
2829  }
2830 
2831  CHECK_EQ(size_t(1), query_exe_context->query_buffers_->result_sets_.size());
2832  auto rows_ptr = std::shared_ptr<ResultSet>(
2833  query_exe_context->query_buffers_->result_sets_[0].release());
2834  rows_ptr->fillOneEntry(reduced_outs);
2835  results = std::move(rows_ptr);
2836  return error_code;
2837 }
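The entry_count computed above is the number of partial aggregate values that reduceResults() must fold per target: one per fragment on CPU, one per GPU thread per fragment otherwise, and a single slot when the shared-memory reduction has already run on the device. The sketch below is not OmniSciDB code; partialEntryCount and the block/grid sizes are assumed values for illustration.

#include <cstddef>
#include <iostream>

size_t partialEntryCount(bool on_gpu,
                         bool gpu_shared_memory_used,
                         size_t block_size,
                         size_t grid_size,
                         size_t num_frags) {
  if (!on_gpu) {
    return num_frags;  // CPU: one partial value per fragment
  }
  // GPU: one partial per thread per fragment, unless the shared-memory code path
  // already reduced each launch down to a single slot.
  return gpu_shared_memory_used ? 1 : block_size * grid_size * num_frags;
}

int main() {
  std::cout << partialEntryCount(false, false, 1024, 64, 3) << "\n";  // 3
  std::cout << partialEntryCount(true, false, 1024, 64, 3) << "\n";   // 196608
  std::cout << partialEntryCount(true, true, 1024, 64, 3) << "\n";    // 1
  return 0;
}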
+ Here is the call graph for this function:

◆ executeTableFunction()

ResultSetPtr Executor::executeTableFunction ( const TableFunctionExecutionUnit  exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const Catalog_Namespace::Catalog &  cat 
)
private

Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps.

Definition at line 1598 of file Execute.cpp.

References TableFunctionCompilationContext::compile(), CompilationOptions::device_type, TableFunctionExecutionContext::execute(), getRowSetMemoryOwner(), INJECT_TIMER, and nukeOldState().

1603  {
1604  INJECT_TIMER(Exec_executeTableFunction);
1605  nukeOldState(false, table_infos, PlanState::DeletedColumnsMap{}, nullptr);
1606 
1607  ColumnCacheMap column_cache; // Note: if we add retries to the table function
1608  // framework, we may want to move this up a level
1609 
1610  ColumnFetcher column_fetcher(this, column_cache);
1611  TableFunctionCompilationContext compilation_context;
1612  compilation_context.compile(exe_unit, co, this);
1613 
1614  TableFunctionExecutionContext exe_context(getRowSetMemoryOwner());
1615  return exe_context.execute(
1616  exe_unit, table_infos, &compilation_context, column_fetcher, co.device_type, this);
1617 }
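
The flow above compiles the execution unit, then hands the compiled context, together with a column fetcher, to an execution context that produces the ResultSet. The following is an illustrative, self-contained sketch of that compile-then-dispatch pattern only; SimpleColumn, SimpleResultSet and compile_table_function are hypothetical stand-ins, not OmniSciDB types.

// Minimal standalone sketch of the compile-then-dispatch pattern described above.
// SimpleColumn, SimpleResultSet and compile_table_function are illustrative
// stand-ins, not OmniSciDB classes.
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

using SimpleColumn = std::vector<int64_t>;     // one input column
using SimpleResultSet = std::vector<int64_t>;  // flattened result rows
using CompiledTableFn = std::function<SimpleResultSet(const SimpleColumn&)>;

// "Compilation": pick (or JIT, in the real engine) a callable for the unit.
CompiledTableFn compile_table_function() {
  return [](const SimpleColumn& in) {
    SimpleResultSet out;
    out.reserve(in.size());
    for (auto v : in) {
      out.push_back(2 * v);  // table function body: one output row per input row
    }
    return out;
  };
}

int main() {
  const SimpleColumn input{1, 2, 3};
  auto fn = compile_table_function();  // compile step
  const auto result = fn(input);       // dispatch/execute step
  for (auto v : result) {
    std::cout << v << '\n';            // 2 4 6 -- consumed by a later step
  }
  return 0;
}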
+ Here is the call graph for this function:

◆ executeUpdate()

void Executor::executeUpdate ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const Catalog_Namespace::Catalog &  cat,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
const UpdateLogForFragment::Callback &  cb,
const bool  is_agg 
)

Definition at line 62 of file ExecuteUpdate.cpp.

References CHECK, CHECK_EQ, CHECK_GT, CPU, FragmentsPerTable::fragment_ids, SharedKernelContext::getFragmentResults(), KernelPerFragment, ExecutionKernel::run(), timer_start(), timer_stop(), and VLOG.

69  {
70  CHECK(cb);
71  VLOG(1) << "Executor " << executor_id_
72  << " is executing update/delete work unit:" << ra_exe_unit_in;
73 
74  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
75  ColumnCacheMap column_cache;
76 
77  ColumnFetcher column_fetcher(this, column_cache);
78  CHECK_GT(ra_exe_unit.input_descs.size(), size_t(0));
79  const auto table_id = ra_exe_unit.input_descs[0].getTableId();
80  const auto& outer_fragments = table_infos.front().info.fragments;
81 
82  std::vector<FragmentsPerTable> fragments = {{0, {0}}};
83  for (size_t tab_idx = 1; tab_idx < ra_exe_unit.input_descs.size(); tab_idx++) {
84  int table_id = ra_exe_unit.input_descs[tab_idx].getTableId();
85  CHECK_EQ(table_infos[tab_idx].table_id, table_id);
86  const auto& fragmentsPerTable = table_infos[tab_idx].info.fragments;
87  FragmentsPerTable entry = {table_id, {}};
88  for (size_t innerFragId = 0; innerFragId < fragmentsPerTable.size(); innerFragId++) {
89  entry.fragment_ids.push_back(innerFragId);
90  }
91  fragments.push_back(entry);
92  }
93 
94  if (outer_fragments.empty()) {
95  return;
96  }
97 
98  const auto max_tuple_count_fragment_it = std::max_element(
99  outer_fragments.begin(), outer_fragments.end(), [](const auto& a, const auto& b) {
100  return a.getNumTuples() < b.getNumTuples();
101  });
102  CHECK(max_tuple_count_fragment_it != outer_fragments.end());
103  int64_t global_max_groups_buffer_entry_guess =
104  max_tuple_count_fragment_it->getNumTuples();
105  if (is_agg) {
106  global_max_groups_buffer_entry_guess = std::min(
107  2 * global_max_groups_buffer_entry_guess, static_cast<int64_t>(100'000'000));
108  }
109 
110  auto query_comp_desc = std::make_unique<QueryCompilationDescriptor>();
111  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
112  {
113  auto clock_begin = timer_start();
114  std::lock_guard<std::mutex> compilation_lock(compilation_mutex_);
115  compilation_queue_time_ms_ += timer_stop(clock_begin);
116 
117  query_mem_desc = query_comp_desc->compile(global_max_groups_buffer_entry_guess,
118  8,
119  /*has_cardinality_estimation=*/true,
120  ra_exe_unit,
121  table_infos,
122  deleted_cols_map,
123  column_fetcher,
124  co,
125  eo,
126  nullptr,
127  this);
128  }
129  CHECK(query_mem_desc);
130 
131  for (size_t fragment_index = 0; fragment_index < outer_fragments.size();
132  ++fragment_index) {
133  const int64_t crt_fragment_tuple_count =
134  outer_fragments[fragment_index].getNumTuples();
135  if (crt_fragment_tuple_count == 0) {
136  // nothing to update
137  continue;
138  }
139 
140  SharedKernelContext shared_context(table_infos);
141  fragments[0] = {table_id, {fragment_index}};
142  {
143  ExecutionKernel current_fragment_kernel(ra_exe_unit,
144  ExecutorDeviceType::CPU,
145  0,
146  eo,
147  column_fetcher,
148  *query_comp_desc,
149  *query_mem_desc,
150  fragments,
151  ExecutorDispatchMode::KernelPerFragment,
152  /*render_info=*/nullptr,
153  /*rowid_lookup_key=*/-1);
154 
155  auto clock_begin = timer_start();
156  std::lock_guard<std::mutex> kernel_lock(kernel_mutex_);
157  kernel_queue_time_ms_ += timer_stop(clock_begin);
158 
159  current_fragment_kernel.run(this, shared_context);
160  }
161  const auto& proj_fragment_results = shared_context.getFragmentResults();
162  if (proj_fragment_results.empty()) {
163  continue;
164  }
165  const auto& proj_fragment_result = proj_fragment_results[0];
166  const auto proj_result_set = proj_fragment_result.first;
167  CHECK(proj_result_set);
168  cb({outer_fragments[fragment_index], fragment_index, proj_result_set});
169  }
170 }
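
The loop above dispatches one kernel per non-empty outer fragment and forwards each fragment's projected result to the supplied callback. A minimal standalone sketch of that per-fragment pattern follows; Fragment, run_kernel_on_fragment and UpdateCallback are hypothetical stand-ins, not the ExecutionKernel/SharedKernelContext API.

// Standalone sketch of the per-fragment dispatch loop above: one kernel per
// non-empty outer fragment, with the projected result handed to a callback.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct Fragment {
  size_t num_tuples;
};

using FragmentResult = std::vector<int64_t>;  // e.g. projected row offsets
using UpdateCallback =
    std::function<void(size_t fragment_index, const FragmentResult&)>;

// Stand-in for running one execution kernel on a single fragment.
FragmentResult run_kernel_on_fragment(const Fragment& frag) {
  FragmentResult result;
  for (size_t i = 0; i < frag.num_tuples; ++i) {
    result.push_back(static_cast<int64_t>(i));
  }
  return result;
}

void execute_update(const std::vector<Fragment>& outer_fragments,
                    const UpdateCallback& cb) {
  for (size_t frag_idx = 0; frag_idx < outer_fragments.size(); ++frag_idx) {
    if (outer_fragments[frag_idx].num_tuples == 0) {
      continue;  // nothing to update in this fragment
    }
    const auto proj_result = run_kernel_on_fragment(outer_fragments[frag_idx]);
    cb(frag_idx, proj_result);  // hand per-fragment results to the caller
  }
}

int main() {
  std::vector<Fragment> fragments{{3}, {0}, {2}};
  execute_update(fragments, [](size_t idx, const FragmentResult& rows) {
    std::cout << "fragment " << idx << ": " << rows.size() << " rows\n";
  });
  return 0;
}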
+ Here is the call graph for this function:

◆ executeWorkUnit()

ResultSetPtr Executor::executeWorkUnit ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit &  ra_exe_unit_in,
const CompilationOptions &  co,
const ExecutionOptions &  options,
const Catalog_Namespace::Catalog &  cat,
RenderInfo *  render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap &  column_cache 
)

Definition at line 1311 of file Execute.cpp.

References cgen_state_, compilation_queue_time_ms_, executeWorkUnitImpl(), executor_id_, kernel_queue_time_ms_, CompilationRetryNewScanLimit::new_scan_limit_, plan_state_, anonymous_namespace{Execute.cpp}::replace_scan_limit(), run_benchmark_import::result, row_set_mem_owner_, and VLOG.

1320  {
1321  VLOG(1) << "Executor " << executor_id_ << " is executing work unit:" << ra_exe_unit_in;
1322 
1323  ScopeGuard cleanup_post_execution = [this] {
1324  // cleanup/unpin GPU buffer allocations
1325  // TODO: separate out this state into a single object
1326  plan_state_.reset(nullptr);
1327  if (cgen_state_) {
1328  cgen_state_->in_values_bitmaps_.clear();
1329  }
1330  };
1331 
1332  try {
1333  auto result = executeWorkUnitImpl(max_groups_buffer_entry_guess,
1334  is_agg,
1335  true,
1336  query_infos,
1337  ra_exe_unit_in,
1338  co,
1339  eo,
1340  cat,
1341  row_set_mem_owner_,
1342  render_info,
1343  has_cardinality_estimation,
1344  column_cache);
1345  if (result) {
1346  result->setKernelQueueTime(kernel_queue_time_ms_);
1347  result->addCompilationQueueTime(compilation_queue_time_ms_);
1348  }
1349  return result;
1350  } catch (const CompilationRetryNewScanLimit& e) {
1351  auto result =
1352  executeWorkUnitImpl(max_groups_buffer_entry_guess,
1353  is_agg,
1354  false,
1355  query_infos,
1356  replace_scan_limit(ra_exe_unit_in, e.new_scan_limit_),
1357  co,
1358  eo,
1359  cat,
1360  row_set_mem_owner_,
1361  render_info,
1362  has_cardinality_estimation,
1363  column_cache);
1364  if (result) {
1365  result->setKernelQueueTime(kernel_queue_time_ms_);
1366  result->addCompilationQueueTime(compilation_queue_time_ms_);
1367  }
1368  return result;
1369  }
1370 }
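
executeWorkUnit wraps executeWorkUnitImpl in a retry: if the first attempt throws CompilationRetryNewScanLimit, the unit is re-run once with the scan limit carried by the exception. A minimal sketch of that retry shape follows; RetryNewScanLimit and run_query are hypothetical stand-ins, not the OmniSciDB classes.

// Standalone sketch of the retry-on-new-scan-limit pattern above.
#include <cstddef>
#include <iostream>
#include <stdexcept>

struct RetryNewScanLimit : std::runtime_error {
  explicit RetryNewScanLimit(size_t limit)
      : std::runtime_error("retry with new scan limit"), new_scan_limit(limit) {}
  size_t new_scan_limit;
};

size_t run_query(size_t scan_limit, bool allow_retry) {
  if (allow_retry && scan_limit < 100) {
    throw RetryNewScanLimit(100);  // first attempt decides the limit was too small
  }
  return scan_limit;  // pretend result: rows scanned
}

size_t run_with_retry(size_t initial_limit) {
  try {
    return run_query(initial_limit, /*allow_retry=*/true);
  } catch (const RetryNewScanLimit& e) {
    // second attempt: use the suggested limit, no further scan-limit retries
    return run_query(e.new_scan_limit, /*allow_retry=*/false);
  }
}

int main() {
  std::cout << run_with_retry(10) << '\n';  // prints 100
  return 0;
}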
+ Here is the call graph for this function:

◆ executeWorkUnitImpl()

ResultSetPtr Executor::executeWorkUnitImpl ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit &  ra_exe_unit_in,
const CompilationOptions &  co,
const ExecutionOptions &  options,
const Catalog_Namespace::Catalog &  cat,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
RenderInfo *  render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap &  column_cache 
)
private

Definition at line 1372 of file Execute.cpp.

References addDeletedColumn(), CompilationOptions::allow_lazy_fetch, ExecutionOptions::allow_runtime_query_interrupt, CHECK, collectAllDeviceResults(), compilation_mutex_, compilation_queue_time_ms_, anonymous_namespace{Execute.cpp}::compute_buffer_entry_guess(), CPU, cpu_threads(), createKernels(), CompilationOptions::device_type, ERR_INTERRUPTED, ERR_OUT_OF_SLOTS, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, executeExplain(), ExecutionOptions::executor_type, CompilationOptions::explain_type, CompilationOptions::filter_on_deleted_column, g_use_tbb_pool, get_available_gpus(), get_context_count(), get_min_byte_width(), getDeviceTypeForTargets(), QueryExecutionError::getErrorCode(), CompilationOptions::hoist_literals, INJECT_TIMER, interrupted_, ExecutionOptions::just_explain, ExecutionOptions::just_validate, MAX_BYTE_WIDTH_SUPPORTED, Native, CompilationOptions::opt_level, plan_state_, Projection, QueryMemoryDescriptor, CompilationOptions::register_intel_jit_listener, resetInterrupt(), resultsUnion(), timer_start(), timer_stop(), VLOG, CompilationOptions::with_dynamic_watchdog, and ExecutionOptions::with_dynamic_watchdog.

Referenced by executeWorkUnit().

1384  {
1385  INJECT_TIMER(Exec_executeWorkUnit);
1386  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
1387  const auto device_type = getDeviceTypeForTargets(ra_exe_unit, co.device_type);
1388  CHECK(!query_infos.empty());
1389  if (!max_groups_buffer_entry_guess) {
1390  // The query has failed the first execution attempt because of running out
1391  // of group by slots. Make the conservative choice: allocate fragment size
1392  // slots and run on the CPU.
1393  CHECK(device_type == ExecutorDeviceType::CPU);
1394  max_groups_buffer_entry_guess = compute_buffer_entry_guess(query_infos);
1395  }
1396 
1397  int8_t crt_min_byte_width{get_min_byte_width()};
1398  do {
1399  SharedKernelContext shared_context(query_infos);
1400  ColumnFetcher column_fetcher(this, column_cache);
1401  auto query_comp_desc_owned = std::make_unique<QueryCompilationDescriptor>();
1402  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc_owned;
1403  if (eo.executor_type == ExecutorType::Native) {
1404  try {
1405  INJECT_TIMER(query_step_compilation);
1406  auto clock_begin = timer_start();
1407  std::lock_guard<std::mutex> compilation_lock(compilation_mutex_);
1408  compilation_queue_time_ms_ += timer_stop(clock_begin);
1409 
1410  query_mem_desc_owned =
1411  query_comp_desc_owned->compile(max_groups_buffer_entry_guess,
1412  crt_min_byte_width,
1413  has_cardinality_estimation,
1414  ra_exe_unit,
1415  query_infos,
1416  deleted_cols_map,
1417  column_fetcher,
1418  {device_type,
1419  co.hoist_literals,
1420  co.opt_level,
1421  co.with_dynamic_watchdog,
1422  co.allow_lazy_fetch,
1423  co.filter_on_deleted_column,
1424  co.explain_type,
1425  co.register_intel_jit_listener},
1426  eo,
1427  render_info,
1428  this);
1429  CHECK(query_mem_desc_owned);
1430  crt_min_byte_width = query_comp_desc_owned->getMinByteWidth();
1431  } catch (CompilationRetryNoCompaction&) {
1432  crt_min_byte_width = MAX_BYTE_WIDTH_SUPPORTED;
1433  continue;
1434  }
1435  } else {
1436  plan_state_.reset(new PlanState(false, query_infos, deleted_cols_map, this));
1437  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
1438  CHECK(!query_mem_desc_owned);
1439  query_mem_desc_owned.reset(
1440  new QueryMemoryDescriptor(this, 0, QueryDescriptionType::Projection, false));
1441  }
1442  if (eo.just_explain) {
1443  return executeExplain(*query_comp_desc_owned);
1444  }
1445 
1446  for (const auto target_expr : ra_exe_unit.target_exprs) {
1447  plan_state_->target_exprs_.push_back(target_expr);
1448  }
1449 
1450  if (!eo.just_validate) {
1451  int available_cpus = cpu_threads();
1452  auto available_gpus = get_available_gpus(cat);
1453 
1454  const auto context_count =
1455  get_context_count(device_type, available_cpus, available_gpus.size());
1456  try {
1457  auto kernels = createKernels(shared_context,
1458  ra_exe_unit,
1459  column_fetcher,
1460  query_infos,
1461  eo,
1462  is_agg,
1463  allow_single_frag_table_opt,
1464  context_count,
1465  *query_comp_desc_owned,
1466  *query_mem_desc_owned,
1467  render_info,
1468  available_gpus,
1469  available_cpus);
1470  if (g_use_tbb_pool) {
1471 #ifdef HAVE_TBB
1472  VLOG(1) << "Using TBB thread pool for kernel dispatch.";
1473  launchKernels<threadpool::TbbThreadPool<void>>(shared_context,
1474  std::move(kernels));
1475 #else
1476  throw std::runtime_error(
1477  "This build is not TBB enabled. Restart the server with "
1478  "\"enable-modern-thread-pool\" disabled.");
1479 #endif
1480  } else {
1481  launchKernels<threadpool::FuturesThreadPool<void>>(shared_context,
1482  std::move(kernels));
1483  }
1484  } catch (QueryExecutionError& e) {
1485  if (eo.with_dynamic_watchdog && interrupted_.load() &&
1486  e.getErrorCode() == ERR_OUT_OF_TIME) {
1487  resetInterrupt();
1488  throw QueryExecutionError(ERR_INTERRUPTED);
1489  }
1490  if (eo.allow_runtime_query_interrupt && interrupted_.load()) {
1491  resetInterrupt();
1492  throw QueryExecutionError(ERR_INTERRUPTED);
1493  }
1494  if (e.getErrorCode() == ERR_OVERFLOW_OR_UNDERFLOW &&
1495  static_cast<size_t>(crt_min_byte_width << 1) <= sizeof(int64_t)) {
1496  crt_min_byte_width <<= 1;
1497  continue;
1498  }
1499  throw;
1500  }
1501  }
1502  if (is_agg) {
1503  try {
1504  return collectAllDeviceResults(shared_context,
1505  ra_exe_unit,
1506  *query_mem_desc_owned,
1507  query_comp_desc_owned->getDeviceType(),
1508  row_set_mem_owner);
1509  } catch (ReductionRanOutOfSlots&) {
1510  throw QueryExecutionError(ERR_OUT_OF_SLOTS);
1511  } catch (OverflowOrUnderflow&) {
1512  crt_min_byte_width <<= 1;
1513  continue;
1514  }
1515  }
1516  return resultsUnion(shared_context, ra_exe_unit);
1517 
1518  } while (static_cast<size_t>(crt_min_byte_width) <= sizeof(int64_t));
1519 
1520  return std::make_shared<ResultSet>(std::vector<TargetInfo>{},
1521  ExecutorDeviceType::CPU,
1522  QueryMemoryDescriptor(),
1523  nullptr,
1524  this);
1525 }
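
The do/while above widens crt_min_byte_width and retries the step when an aggregate overflows its output slots, stopping once the width would exceed sizeof(int64_t). A standalone sketch of that widen-and-retry loop follows; OverflowOrUnderflow and run_step are hypothetical stand-ins for illustration only.

// Standalone sketch of the widen-and-retry loop above: start at a minimum output
// byte width and double it on overflow until it would exceed 8 bytes (int64_t).
#include <cstdint>
#include <iostream>
#include <stdexcept>

struct OverflowOrUnderflow : std::runtime_error {
  OverflowOrUnderflow() : std::runtime_error("overflow or underflow") {}
};

// Pretend aggregate step: succeeds only once the slots are at least 4 bytes wide.
int64_t run_step(int8_t byte_width) {
  if (byte_width < 4) {
    throw OverflowOrUnderflow();
  }
  return 42;
}

int64_t run_with_widening(int8_t min_byte_width) {
  int8_t crt_byte_width = min_byte_width;
  do {
    try {
      return run_step(crt_byte_width);
    } catch (const OverflowOrUnderflow&) {
      crt_byte_width <<= 1;  // widen the per-slot byte width and retry
      continue;
    }
  } while (static_cast<size_t>(crt_byte_width) <= sizeof(int64_t));
  throw std::runtime_error("could not find a workable byte width");
}

int main() {
  std::cout << run_with_widening(1) << '\n';  // retries 1 -> 2 -> 4, then prints 42
  return 0;
}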