OmniSciDB  2e3a973ef4
Executor Class Reference

#include <Execute.h>


Classes

class  FetchCacheAnchor
 
struct  GroupColLLVMValue
 
struct  JoinHashTableOrError
 

Public Types

using ExecutorId = size_t
 
using CachedCardinality = std::pair< bool, size_t >
 

Public Member Functions

 Executor (const ExecutorId id, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
 
StringDictionaryProxy * getStringDictionaryProxy (const int dictId, const std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const
 
bool isCPUOnly () const
 
bool isArchMaxwell (const ExecutorDeviceType dt) const
 
bool containsLeftDeepOuterJoin () const
 
const ColumnDescriptor * getColumnDescriptor (const Analyzer::ColumnVar *) const
 
const ColumnDescriptor * getPhysicalColumnDescriptor (const Analyzer::ColumnVar *, int) const
 
const Catalog_Namespace::Catalog * getCatalog () const
 
void setCatalog (const Catalog_Namespace::Catalog *catalog)
 
const std::shared_ptr< RowSetMemoryOwner > getRowSetMemoryOwner () const
 
const TemporaryTables * getTemporaryTables () const
 
Fragmenter_Namespace::TableInfo getTableInfo (const int table_id) const
 
const TableGeneration & getTableGeneration (const int table_id) const
 
ExpressionRange getColRange (const PhysicalInput &) const
 
size_t getNumBytesForFetchedRow (const std::set< int > &table_ids_to_fetch) const
 
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo (const std::vector< Analyzer::Expr *> &target_exprs) const
 
void registerActiveModule (void *module, const int device_id) const
 
void unregisterActiveModule (void *module, const int device_id) const
 
void interrupt (const std::string &query_session="", const std::string &interrupt_session="")
 
void resetInterrupt ()
 
void enableRuntimeQueryInterrupt (const unsigned interrupt_freq) const
 
int8_t warpSize () const
 
unsigned gridSize () const
 
unsigned numBlocksPerMP () const
 
unsigned blockSize () const
 
size_t maxGpuSlabSize () const
 
ResultSetPtr executeWorkUnit (size_t &max_groups_buffer_entry_guess, const bool is_agg, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
void executeUpdate (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const UpdateLogForFragment::Callback &cb, const bool is_agg)
 
void setupCaching (const std::unordered_set< PhysicalInput > &phys_inputs, const std::unordered_set< int > &phys_table_ids)
 
void setColRangeCache (const AggregatedColRange &aggregated_col_range)
 
template<typename SESSION_MAP_LOCK >
void setCurrentQuerySession (const std::string &query_session, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
std::string & getCurrentQuerySession (SESSION_MAP_LOCK &read_lock)
 
template<typename SESSION_MAP_LOCK >
bool checkCurrentQuerySession (const std::string &candidate_query_session, SESSION_MAP_LOCK &read_lock)
 
template<typename SESSION_MAP_LOCK >
void invalidateRunningQuerySession (SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
bool addToQuerySessionList (const std::string &query_session, const std::string &query_str, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
bool removeFromQuerySessionList (const std::string &query_session, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
void setQuerySessionAsInterrupted (const std::string &query_session, SESSION_MAP_LOCK &write_lock)
 
template<typename SESSION_MAP_LOCK >
bool checkIsQuerySessionInterrupted (const std::string &query_session, SESSION_MAP_LOCK &read_lock)
 
std::optional< QuerySessionStatus > getQuerySessionInfo (const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
mapd_shared_mutex & getSessionLock ()
 
void addToCardinalityCache (const std::string &cache_key, const size_t cache_value)
 
CachedCardinality getCachedCardinality (const std::string &cache_key)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 
template<>
void setCurrentQuerySession (const std::string &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
std::string & getCurrentQuerySession (mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
template<>
bool checkCurrentQuerySession (const std::string &candidate_query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
template<>
void invalidateRunningQuerySession (mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
bool addToQuerySessionList (const std::string &query_session, const std::string &query_str, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
bool removeFromQuerySessionList (const std::string &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
void setQuerySessionAsInterrupted (const std::string &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
template<>
bool checkIsQuerySessionInterrupted (const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 

Static Public Member Functions

static std::shared_ptr< Executor > getExecutor (const ExecutorId id, const std::string &debug_dir="", const std::string &debug_file="", const SystemParameters system_parameters=SystemParameters())
 
static void nukeCacheOfExecutors ()
 
static void clearMemory (const Data_Namespace::MemoryLevel memory_level)
 
static size_t getArenaBlockSize ()
 
static std::pair< int64_t, int32_t > reduceResults (const SQLAgg agg, const SQLTypeInfo &ti, const int64_t agg_init_val, const int8_t out_byte_width, const int64_t *out_vec, const size_t out_vec_sz, const bool is_group_by, const bool float_argument_input)
 
static void addCodeToCache (const CodeCacheKey &, std::shared_ptr< CompilationContext >, llvm::Module *, CodeCache &)
 

Static Public Attributes

static const ExecutorId UNITARY_EXECUTOR_ID = 0
 
static const size_t high_scan_limit {32000000}
 
static const int32_t ERR_DIV_BY_ZERO {1}
 
static const int32_t ERR_OUT_OF_GPU_MEM {2}
 
static const int32_t ERR_OUT_OF_SLOTS {3}
 
static const int32_t ERR_UNSUPPORTED_SELF_JOIN {4}
 
static const int32_t ERR_OUT_OF_RENDER_MEM {5}
 
static const int32_t ERR_OUT_OF_CPU_MEM {6}
 
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW {7}
 
static const int32_t ERR_OUT_OF_TIME {9}
 
static const int32_t ERR_INTERRUPTED {10}
 
static const int32_t ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED {11}
 
static const int32_t ERR_TOO_MANY_LITERALS {12}
 
static const int32_t ERR_STRING_CONST_IN_RESULTSET {13}
 
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY {14}
 
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES {15}
 
static const int32_t ERR_GEOS {16}
 
static std::mutex compilation_mutex_
 
static std::mutex kernel_mutex_
 

Private Types

using PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>
 

Private Member Functions

void clearMetaInfoCache ()
 
int deviceCount (const ExecutorDeviceType) const
 
int deviceCountForMemoryLevel (const Data_Namespace::MemoryLevel memory_level) const
 
llvm::Value * codegenWindowFunction (const size_t target_index, const CompilationOptions &co)
 
llvm::Value * codegenWindowFunctionAggregate (const CompilationOptions &co)
 
llvm::BasicBlock * codegenWindowResetStateControlFlow ()
 
void codegenWindowFunctionStateInit (llvm::Value *aggregate_state)
 
llvm::Value * codegenWindowFunctionAggregateCalls (llvm::Value *aggregate_state, const CompilationOptions &co)
 
void codegenWindowAvgEpilogue (llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
 
llvm::Value * codegenAggregateWindowState ()
 
llvm::Value * aggregateWindowStatePtr ()
 
bool isArchPascalOrLater (const ExecutorDeviceType dt) const
 
bool needFetchAllFragments (const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
 
void executeWorkUnitPerFragment (const RelAlgExecutionUnit &ra_exe_unit, const InputTableInfo &table_info, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, PerFragmentCallBack &cb)
 Compiles and dispatches a work unit per fragment, processing the results with the per-fragment callback. Currently used for computing metrics over fragments (metadata). More...
 
ResultSetPtr executeExplain (const QueryCompilationDescriptor &)
 
ResultSetPtr executeTableFunction (const TableFunctionExecutionUnit exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat)
 Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps. More...
 
ExecutorDeviceType getDeviceTypeForTargets (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType requested_device_type)
 
ResultSetPtr collectAllDeviceResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
 
ResultSetPtr collectAllDeviceShardedTopResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
 
std::unordered_map< int, const Analyzer::BinOper * > getInnerTabIdToJoinCond () const
 
std::vector< std::unique_ptr< ExecutionKernel > > createKernels (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, ColumnFetcher &column_fetcher, const std::vector< InputTableInfo > &table_infos, const ExecutionOptions &eo, const bool is_agg, const bool allow_single_frag_table_opt, const size_t context_count, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, RenderInfo *render_info, std::unordered_set< int > &available_gpus, int &available_cpus)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 
std::vector< size_t > getTableFragmentIndices (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type, const size_t table_idx, const size_t outer_frag_idx, std::map< int, const TableFragments *> &selected_tables_fragments, const std::unordered_map< int, const Analyzer::BinOper *> &inner_table_id_to_join_condition)
 
bool skipFragmentPair (const Fragmenter_Namespace::FragmentInfo &outer_fragment_info, const Fragmenter_Namespace::FragmentInfo &inner_fragment_info, const int inner_table_id, const std::unordered_map< int, const Analyzer::BinOper *> &inner_table_id_to_join_condition, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
 
FetchResult fetchChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments *> &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator)
 
FetchResult fetchUnionChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments *> &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator)
 
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags (const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< int, const TableFragments *> &all_tables_fragments)
 
void buildSelectedFragsMapping (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
void buildSelectedFragsMappingForUnion (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< size_t > getFragmentCount (const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
 
int32_t executePlanWithGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t *>> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, RenderInfo *render_info)
 
int32_t executePlanWithoutGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const std::vector< Analyzer::Expr *> &target_exprs, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t *>> &col_buffers, QueryExecutionContext *query_exe_context, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *data_mgr, const int device_id, const uint32_t start_rowid, const uint32_t num_tables, RenderInfo *render_info)
 
ResultSetPtr resultsUnion (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< int64_t > getJoinHashTablePtrs (const ExecutorDeviceType device_type, const int device_id)
 
ResultSetPtr reduceMultiDeviceResults (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceMultiDeviceResultSets (std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceSpeculativeTopN (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr executeWorkUnitImpl (size_t &max_groups_buffer_entry_guess, const bool is_agg, const bool allow_single_frag_table_opt, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, std::shared_ptr< RowSetMemoryOwner >, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
std::vector< llvm::Value * > inlineHoistedLiterals ()
 
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit (const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
 
llvm::BasicBlock * codegenSkipDeletedOuterTableRow (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::vector< JoinLoop > buildJoinLoops (RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
 
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
 
std::shared_ptr< JoinHashTableInterface > buildCurrentLevelHashTable (const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
 
void redeclareFilterFunction ()
 
llvm::Value * addJoinLoopIterator (const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
 
void codegenJoinLoops (const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
 
bool compileBody (const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
 
void createErrorCheckControlFlow (llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type)
 
void preloadFragOffsets (const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
 
JoinHashTableOrError buildHashTableForQualifier (const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinHashTableInterface::HashType preferred_hash_type, ColumnCacheMap &column_cache)
 
void nukeOldState (const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU (llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function *> &, const CompilationOptions &)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU (llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function *> &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
 
std::string generatePTX (const std::string &) const
 
void initializeNVPTXBackend () const
 
int64_t deviceCycles (int milliseconds) const
 
GroupColLLVMValue groupByColumnCodegen (Analyzer::Expr *group_by_col, const size_t col_width, const CompilationOptions &, const bool translate_null_val, const int64_t translated_null_val, GroupByAndAggregate::DiamondCodegen &, std::stack< llvm::BasicBlock *> &, const bool thread_mem_shared)
 
llvm::Value * castToFP (llvm::Value *val)
 
llvm::Value * castToIntPtrTyIn (llvm::Value *val, const size_t bit_width)
 
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::pair< bool, int64_t > skipFragment (const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
std::pair< bool, int64_t > skipFragmentInnerJoins (const InputDescriptor &table_desc, const RelAlgExecutionUnit &ra_exe_unit, const Fragmenter_Namespace::FragmentInfo &fragment, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
AggregatedColRange computeColRangesCache (const std::unordered_set< PhysicalInput > &phys_inputs)
 
StringDictionaryGenerations computeStringDictionaryGenerations (const std::unordered_set< PhysicalInput > &phys_inputs)
 
TableGenerations computeTableGenerations (std::unordered_set< int > phys_table_ids)
 
std::shared_ptr< CompilationContext > getCodeFromCache (const CodeCacheKey &, const CodeCache &)
 
std::vector< int8_t > serializeLiterals (const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
 
llvm::Value * spillDoubleElement (llvm::Value *elem_val, llvm::Type *elem_ty)
 

Static Private Member Functions

static size_t align (const size_t off_in, const size_t alignment)
 

Private Attributes

std::unique_ptr< CgenState > cgen_state_
 
std::unique_ptr< PlanState > plan_state_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::mutex gpu_exec_mutex_ [max_gpu_count]
 
std::shared_ptr< StringDictionaryProxy > lit_str_dict_proxy_
 
std::mutex str_dict_mutex_
 
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
 
CodeCache cpu_code_cache_
 
CodeCache gpu_code_cache_
 
const unsigned block_size_x_
 
const unsigned grid_size_x_
 
const size_t max_gpu_slab_size_
 
const std::string debug_dir_
 
const std::string debug_file_
 
const ExecutorId executor_id_
 
const Catalog_Namespace::Catalog * catalog_
 
const TemporaryTables * temporary_tables_
 
int64_t kernel_queue_time_ms_ = 0
 
int64_t compilation_queue_time_ms_ = 0
 
std::unique_ptr< WindowProjectNodeContext > window_project_node_context_owned_
 
WindowFunctionContext * active_window_function_ {nullptr}
 
InputTableInfoCache input_table_info_cache_
 
AggregatedColRange agg_col_range_cache_
 
StringDictionaryGenerations string_dictionary_generations_
 
TableGenerations table_generations_
 

Static Private Attributes

static const int max_gpu_count {16}
 
static std::mutex gpu_active_modules_mutex_
 
static uint32_t gpu_active_modules_device_mask_ {0x0}
 
static void * gpu_active_modules_ [max_gpu_count]
 
static std::atomic< bool > interrupted_ {false}
 
static const size_t baseline_threshold
 
static const size_t code_cache_size {1000}
 
static mapd_shared_mutex executor_session_mutex_
 
static std::string current_query_session_ {""}
 
static InterruptFlagMap queries_interrupt_flag_
 
static QuerySessionMap queries_session_map_
 
static std::map< int, std::shared_ptr< Executor > > executors_
 
static std::atomic_flag execute_spin_lock_ = ATOMIC_FLAG_INIT
 
static mapd_shared_mutex execute_mutex_
 
static mapd_shared_mutex executors_cache_mutex_
 
static mapd_shared_mutex recycler_mutex_
 
static std::unordered_map< std::string, size_t > cardinality_cache_
 

Friends

class BaselineJoinHashTable
 
class CodeGenerator
 
class ColumnFetcher
 
class ExecutionKernel
 
class OverlapsJoinHashTable
 
class GroupByAndAggregate
 
class QueryCompilationDescriptor
 
class QueryMemoryDescriptor
 
class QueryMemoryInitializer
 
class QueryFragmentDescriptor
 
class QueryExecutionContext
 
class ResultSet
 
class InValuesBitmap
 
class JoinHashTable
 
class LeafAggregator
 
class QueryRewriter
 
class PendingExecutionClosure
 
class RelAlgExecutor
 
class TableOptimizer
 
class TableFunctionCompilationContext
 
class TableFunctionExecutionContext
 
struct TargetExprCodegenBuilder
 
struct TargetExprCodegen
 
class WindowProjectNodeContext
 

Detailed Description

Definition at line 329 of file Execute.h.

Member Typedef Documentation

◆ CachedCardinality

using Executor::CachedCardinality = std::pair<bool, size_t>

Definition at line 879 of file Execute.h.

◆ ExecutorId

using Executor::ExecutorId = size_t

Definition at line 336 of file Execute.h.

◆ PerFragmentCallBack

Definition at line 487 of file Execute.h.
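
PerFragmentCallBack is the callback type consumed by executeWorkUnitPerFragment(): the executor invokes it once per fragment of the given table, passing that fragment's ResultSet together with its FragmentInfo metadata. A minimal sketch of a conforming callback, written as it might appear inside a friend class such as TableOptimizer, and assuming the usual ResultSet::rowCount() and FragmentInfo::fragmentId accessors:

// Matches std::function<void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo&)>.
Executor::PerFragmentCallBack cb = [](ResultSetPtr fragment_results,
                                      const Fragmenter_Namespace::FragmentInfo& frag_info) {
  if (fragment_results) {
    VLOG(1) << "fragment " << frag_info.fragmentId << " produced "
            << fragment_results->rowCount() << " rows";
  }
};
// 'cb' is then passed as the last argument of executeWorkUnitPerFragment().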

Constructor & Destructor Documentation

◆ Executor()

Executor::Executor ( const ExecutorId  id,
const size_t  block_size_x,
const size_t  grid_size_x,
const size_t  max_gpu_slab_size,
const std::string &  debug_dir,
const std::string &  debug_file 
)

Definition at line 131 of file Execute.cpp.

References block_size_x_, catalog_, code_cache_size, cpu_code_cache_, debug_dir_, debug_file_, executor_id_, gpu_code_cache_, grid_size_x_, input_table_info_cache_, max_gpu_slab_size_, and temporary_tables_.

Referenced by WatchdogException::WatchdogException(), and window_function_is_aggregate().

137  : cgen_state_(new CgenState({}, false))
140  , block_size_x_(block_size_x)
141  , grid_size_x_(grid_size_x)
142  , max_gpu_slab_size_(max_gpu_slab_size)
143  , debug_dir_(debug_dir)
144  , debug_file_(debug_file)
145  , executor_id_(executor_id)
146  , catalog_(nullptr)
147  , temporary_tables_(nullptr)
148  , input_table_info_cache_(this) {}
const std::string debug_dir_
Definition: Execute.h:940
static const size_t code_cache_size
Definition: Execute.h:935
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
CodeCache gpu_code_cache_
Definition: Execute.h:931
const ExecutorId executor_id_
Definition: Execute.h:943
const size_t max_gpu_slab_size_
Definition: Execute.h:939
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
const std::string debug_file_
Definition: Execute.h:941
const unsigned block_size_x_
Definition: Execute.h:937
const unsigned grid_size_x_
Definition: Execute.h:938
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:956
CodeCache cpu_code_cache_
Definition: Execute.h:930
const TemporaryTables * temporary_tables_
Definition: Execute.h:945
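
In practice an Executor is rarely constructed directly; callers go through the static getExecutor() factory, which caches instances keyed by ExecutorId (see executors_ below). A minimal usage sketch, assuming a Catalog_Namespace::Catalog instance 'cat' is available in the calling context:

// debug_dir, debug_file and system_parameters fall back to their defaults.
auto executor = Executor::getExecutor(Executor::UNITARY_EXECUTOR_ID);
CHECK(executor);
executor->setCatalog(&cat);  // 'cat' is assumed to exist in the caller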

Member Function Documentation

◆ addCodeToCache()

void Executor::addCodeToCache ( const CodeCacheKey &  key,
std::shared_ptr< CompilationContext >  compilation_context,
llvm::Module *  module,
CodeCache &  cache 
)
static

Definition at line 224 of file NativeCodegen.cpp.

References LruCache< key_t, value_t, hash_t >::put().

Referenced by ResultSetReductionJIT::finalizeReductionCode(), and StubGenerator::generateStub().

227  {
228  cache.put(key,
229  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
230  std::move(compilation_context), std::move(module)));
231 }
void put(const key_t &key, value_t &&value)
Definition: LruCache.hpp:27
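
getCodeFromCache() is the read-side counterpart of this function; together they implement a compile-or-reuse pattern over the CPU and GPU code caches (cpu_code_cache_, gpu_code_cache_). An illustrative sketch only; names such as 'key' and 'module' are placeholders, not the actual call sites:

std::shared_ptr<CompilationContext> ctx = getCodeFromCache(key, cpu_code_cache_);
if (!ctx) {
  // ... run codegen, e.g. optimizeAndCodegenCPU(...), producing 'ctx' and the owning llvm::Module* 'module' ...
  addCodeToCache(key, ctx, module, cpu_code_cache_);
}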

◆ addDeletedColumn()

std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > Executor::addDeletedColumn ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 3195 of file Execute.cpp.

References catalog_, CHECK, CHECK_EQ, CompilationOptions::filter_on_deleted_column, Catalog_Namespace::Catalog::getDeletedColumnIfRowsDeleted(), Catalog_Namespace::Catalog::getMetadataForTable(), and TABLE.

Referenced by executeWorkUnitImpl(), and executeWorkUnitPerFragment().

3197  {
3198  if (!co.filter_on_deleted_column) {
3199  return std::make_tuple(ra_exe_unit, PlanState::DeletedColumnsMap{});
3200  }
3201  auto ra_exe_unit_with_deleted = ra_exe_unit;
3202  PlanState::DeletedColumnsMap deleted_cols_map;
3203  for (const auto& input_table : ra_exe_unit_with_deleted.input_descs) {
3204  if (input_table.getSourceType() != InputSourceType::TABLE) {
3205  continue;
3206  }
3207  const auto td = catalog_->getMetadataForTable(input_table.getTableId());
3208  CHECK(td);
3209  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
3210  if (!deleted_cd) {
3211  continue;
3212  }
3213  CHECK(deleted_cd->columnType.is_boolean());
3214  // check deleted column is not already present
3215  bool found = false;
3216  for (const auto& input_col : ra_exe_unit_with_deleted.input_col_descs) {
3217  if (input_col.get()->getColId() == deleted_cd->columnId &&
3218  input_col.get()->getScanDesc().getTableId() == deleted_cd->tableId &&
3219  input_col.get()->getScanDesc().getNestLevel() == input_table.getNestLevel()) {
3220  found = true;
3221  }
3222  }
3223  if (!found) {
3224  // add deleted column
3225  ra_exe_unit_with_deleted.input_col_descs.emplace_back(new InputColDescriptor(
3226  deleted_cd->columnId, deleted_cd->tableId, input_table.getNestLevel()));
3227  auto deleted_cols_it = deleted_cols_map.find(deleted_cd->tableId);
3228  if (deleted_cols_it == deleted_cols_map.end()) {
3229  CHECK(deleted_cols_map.insert(std::make_pair(deleted_cd->tableId, deleted_cd))
3230  .second);
3231  } else {
3232  CHECK_EQ(deleted_cd, deleted_cols_it->second);
3233  }
3234  }
3235  }
3236  return std::make_tuple(ra_exe_unit_with_deleted, deleted_cols_map);
3237 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
const ColumnDescriptor * getDeletedColumnIfRowsDeleted(const TableDescriptor *td) const
Definition: Catalog.cpp:2856
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
std::unordered_map< TableId, const ColumnDescriptor * > DeletedColumnsMap
Definition: PlanState.h:44
#define CHECK(condition)
Definition: Logger.h:197

◆ addJoinLoopIterator()

llvm::Value * Executor::addJoinLoopIterator ( const std::vector< llvm::Value *> &  prev_iters,
const size_t  level_idx 
)
private

Definition at line 635 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, and CgenState::scan_idx_to_hash_pos_.

636  {
638  // Iterators are added for loop-outer joins when the head of the loop is generated,
 639  // then once again when the body is generated. Allow this instead of special handling
640  // of call sites.
641  const auto it = cgen_state_->scan_idx_to_hash_pos_.find(level_idx);
642  if (it != cgen_state_->scan_idx_to_hash_pos_.end()) {
643  return it->second;
644  }
645  CHECK(!prev_iters.empty());
646  llvm::Value* matching_row_index = prev_iters.back();
647  const auto it_ok =
648  cgen_state_->scan_idx_to_hash_pos_.emplace(level_idx, matching_row_index);
649  CHECK(it_ok.second);
650  return matching_row_index;
651 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:197

◆ addToCardinalityCache()

void Executor::addToCardinalityCache ( const std::string &  cache_key,
const size_t  cache_value 
)

Definition at line 3617 of file Execute.cpp.

References cardinality_cache_, g_use_estimator_result_cache, recycler_mutex_, and VLOG.

3618  {
3620  mapd_unique_lock<mapd_shared_mutex> lock(recycler_mutex_);
3621  cardinality_cache_[cache_key] = cache_value;
3622  VLOG(1) << "Put estimated cardinality to the cache";
3623  }
3624 }
static std::unordered_map< std::string, size_t > cardinality_cache_
Definition: Execute.h:978
bool g_use_estimator_result_cache
Definition: Execute.cpp:109
static mapd_shared_mutex recycler_mutex_
Definition: Execute.h:977
#define VLOG(n)
Definition: Logger.h:291
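
Paired with getCachedCardinality(), this provides a simple string-keyed cache for estimator results (only effective while g_use_estimator_result_cache is enabled, per the listing above); the returned CachedCardinality is the std::pair<bool, size_t> declared earlier, where the bool presumably indicates whether a value was found for the key. A minimal sketch, assuming the caller has already serialized the estimator work unit into 'cache_key':

executor->addToCardinalityCache(cache_key, 12345);      // store an estimate

const auto cached = executor->getCachedCardinality(cache_key);
if (cached.first) {
  const size_t estimated_cardinality = cached.second;   // reuse instead of re-running the estimator
}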

◆ addToQuerySessionList() [1/2]

template<typename SESSION_MAP_LOCK >
bool Executor::addToQuerySessionList ( const std::string &  query_session,
const std::string &  query_str,
SESSION_MAP_LOCK &  write_lock 
)

◆ addToQuerySessionList() [2/2]

template<>
bool Executor::addToQuerySessionList ( const std::string &  query_session,
const std::string &  query_str,
mapd_unique_lock< mapd_shared_mutex > &  write_lock 
)

Definition at line 3574 of file Execute.cpp.

References queries_interrupt_flag_, and queries_session_map_.

3576  {
3577  queries_session_map_.emplace(
3578  query_session,
3579  QuerySessionStatus(query_session, query_str, std::chrono::system_clock::now()));
3580  return queries_interrupt_flag_.emplace(query_session, false).second;
3581 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:965
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:963
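
The session-tracking members above are templated on the lock type and are intended to be called while holding the mutex returned by getSessionLock(); the specializations shown take a mapd_unique_lock for writes and a mapd_shared_lock for reads. A minimal lifecycle sketch, assuming 'session_id' and 'query_str' are owned by the caller:

{
  mapd_unique_lock<mapd_shared_mutex> write_lock(executor->getSessionLock());
  executor->addToQuerySessionList(session_id, query_str, write_lock);
  executor->setCurrentQuerySession(session_id, write_lock);
}
// ... execute the query ...
{
  mapd_unique_lock<mapd_shared_mutex> write_lock(executor->getSessionLock());
  executor->removeFromQuerySessionList(session_id, write_lock);
  executor->invalidateRunningQuerySession(write_lock);
}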

◆ aggregateWindowStatePtr()

llvm::Value * Executor::aggregateWindowStatePtr ( )
private

Definition at line 124 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), and kFLOAT.

124  {
126  const auto window_func_context =
128  const auto window_func = window_func_context->getWindowFunction();
129  const auto arg_ti = get_adjusted_window_type_info(window_func);
130  llvm::Type* aggregate_state_type =
131  arg_ti.get_type() == kFLOAT
132  ? llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0)
133  : llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
134  const auto aggregate_state_i64 = cgen_state_->llInt(
135  reinterpret_cast<const int64_t>(window_func_context->aggregateState()));
136  return cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_i64,
137  aggregate_state_type);
138 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

◆ align()

static size_t Executor::align ( const size_t  off_in,
const size_t  alignment 
)
inlinestaticprivate

Definition at line 891 of file Execute.h.

Referenced by serializeLiterals().

891  {
892  size_t off = off_in;
893  if (off % alignment != 0) {
894  off += (alignment - off % alignment);
895  }
896  return off;
897  }

◆ blockSize()

unsigned Executor::blockSize ( ) const

Definition at line 3123 of file Execute.cpp.

References block_size_x_, catalog_, CHECK, Data_Namespace::DataMgr::getCudaMgr(), and Catalog_Namespace::Catalog::getDataMgr().

Referenced by executePlanWithGroupBy(), and executePlanWithoutGroupBy().

3123  {
3124  CHECK(catalog_);
3125  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
3126  CHECK(cuda_mgr);
3127  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3128  return block_size_x_ ? block_size_x_ : dev_props.front().maxThreadsPerBlock;
3129 }
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:209
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
const unsigned block_size_x_
Definition: Execute.h:937
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:208
#define CHECK(condition)
Definition: Logger.h:197

◆ buildCurrentLevelHashTable()

std::shared_ptr< JoinHashTableInterface > Executor::buildCurrentLevelHashTable ( const JoinCondition &  current_level_join_conditions,
RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache,
std::vector< std::string > &  fail_reasons 
)
private

Definition at line 490 of file IRCodegen.cpp.

References anonymous_namespace{IRCodegen.cpp}::add_qualifier_to_execution_unit(), AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, Data_Namespace::CPU_LEVEL, CompilationOptions::device_type, Executor::JoinHashTableOrError::fail_reason, GPU, Data_Namespace::GPU_LEVEL, Executor::JoinHashTableOrError::hash_table, INNER, IS_EQUIVALENCE, PlanState::join_info_, JoinHashTableInterface::OneToOne, CodeGenerator::plan_state_, JoinCondition::quals, and JoinCondition::type.

496  {
498  if (current_level_join_conditions.type != JoinType::INNER &&
499  current_level_join_conditions.quals.size() > 1) {
500  fail_reasons.emplace_back("No equijoin expression found for outer join");
501  return nullptr;
502  }
503  std::shared_ptr<JoinHashTableInterface> current_level_hash_table;
504  for (const auto& join_qual : current_level_join_conditions.quals) {
505  auto qual_bin_oper = std::dynamic_pointer_cast<Analyzer::BinOper>(join_qual);
506  if (!qual_bin_oper || !IS_EQUIVALENCE(qual_bin_oper->get_optype())) {
507  fail_reasons.emplace_back("No equijoin expression found");
508  if (current_level_join_conditions.type == JoinType::INNER) {
509  add_qualifier_to_execution_unit(ra_exe_unit, join_qual);
510  }
511  continue;
512  }
513  JoinHashTableOrError hash_table_or_error;
514  if (!current_level_hash_table) {
515  hash_table_or_error = buildHashTableForQualifier(
516  qual_bin_oper,
517  query_infos,
521  column_cache);
522  current_level_hash_table = hash_table_or_error.hash_table;
523  }
524  if (hash_table_or_error.hash_table) {
525  plan_state_->join_info_.join_hash_tables_.push_back(hash_table_or_error.hash_table);
526  plan_state_->join_info_.equi_join_tautologies_.push_back(qual_bin_oper);
527  } else {
528  fail_reasons.push_back(hash_table_or_error.fail_reason);
529  if (current_level_join_conditions.type == JoinType::INNER) {
530  add_qualifier_to_execution_unit(ra_exe_unit, qual_bin_oper);
531  }
532  }
533  }
534  return current_level_hash_table;
535 }
#define IS_EQUIVALENCE(X)
Definition: sqldefs.h:67
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
void add_qualifier_to_execution_unit(RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< Analyzer::Expr > &qual)
Definition: IRCodegen.cpp:228
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
std::list< std::shared_ptr< Analyzer::Expr > > quals
JoinHashTableOrError buildHashTableForQualifier(const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinHashTableInterface::HashType preferred_hash_type, ColumnCacheMap &column_cache)
Definition: Execute.cpp:3069

◆ buildHashTableForQualifier()

Executor::JoinHashTableOrError Executor::buildHashTableForQualifier ( const std::shared_ptr< Analyzer::BinOper > &  qual_bin_oper,
const std::vector< InputTableInfo > &  query_infos,
const MemoryLevel  memory_level,
const JoinHashTableInterface::HashType  preferred_hash_type,
ColumnCacheMap &  column_cache 
)
private

Definition at line 3069 of file Execute.cpp.

References deviceCountForMemoryLevel(), ERR_INTERRUPTED, g_enable_dynamic_watchdog, g_enable_overlaps_hashjoin, g_enable_runtime_query_interrupt, JoinHashTableInterface::getInstance(), interrupted_, and resetInterrupt().

3074  {
3075  if (!g_enable_overlaps_hashjoin && qual_bin_oper->is_overlaps_oper()) {
3076  return {nullptr, "Overlaps hash join disabled, attempting to fall back to loop join"};
3077  }
3078  // check whether the interrupt flag turns on (non kernel-time query interrupt)
3080  interrupted_.load()) {
3081  resetInterrupt();
3083  }
3084  try {
3085  auto tbl =
3087  query_infos,
3088  memory_level,
3089  preferred_hash_type,
3090  deviceCountForMemoryLevel(memory_level),
3091  column_cache,
3092  this);
3093  return {tbl, ""};
3094  } catch (const HashJoinFail& e) {
3095  return {nullptr, e.what()};
3096  }
3097 }
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:989
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:75
bool g_enable_overlaps_hashjoin
Definition: Execute.cpp:92
static std::shared_ptr< JoinHashTableInterface > getInstance(const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor)
Make hash table from an in-flight SQL query&#39;s parse tree etc.
int deviceCountForMemoryLevel(const Data_Namespace::MemoryLevel memory_level) const
Definition: Execute.cpp:626
void resetInterrupt()
static std::atomic< bool > interrupted_
Definition: Execute.h:923
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:108

◆ buildIsDeletedCb()

std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> Executor::buildIsDeletedCb ( const RelAlgExecutionUnit &  ra_exe_unit,
const size_t  level_idx,
const CompilationOptions &  co 
)
private

Definition at line 431 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::filter_on_deleted_column, PlanState::getDeletedColForTable(), RelAlgExecutionUnit::input_descs, CgenState::ir_builder_, CgenState::llBool(), CgenState::llInt(), CodeGenerator::plan_state_, TABLE, and CodeGenerator::toBool().

433  {
435  if (!co.filter_on_deleted_column) {
436  return nullptr;
437  }
438  CHECK_LT(level_idx + 1, ra_exe_unit.input_descs.size());
439  const auto input_desc = ra_exe_unit.input_descs[level_idx + 1];
440  if (input_desc.getSourceType() != InputSourceType::TABLE) {
441  return nullptr;
442  }
443 
444  const auto deleted_cd = plan_state_->getDeletedColForTable(input_desc.getTableId());
445  if (!deleted_cd) {
446  return nullptr;
447  }
448  CHECK(deleted_cd->columnType.is_boolean());
449  const auto deleted_expr = makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
450  input_desc.getTableId(),
451  deleted_cd->columnId,
452  input_desc.getNestLevel());
453  return [this, deleted_expr, level_idx, &co](const std::vector<llvm::Value*>& prev_iters,
454  llvm::Value* have_more_inner_rows) {
455  const auto matching_row_index = addJoinLoopIterator(prev_iters, level_idx + 1);
456  // Avoid fetching the deleted column from a position which is not valid.
457  // An invalid position can be returned by a one to one hash lookup (negative)
458  // or at the end of iteration over a set of matching values.
459  llvm::Value* is_valid_it{nullptr};
460  if (have_more_inner_rows) {
461  is_valid_it = have_more_inner_rows;
462  } else {
463  is_valid_it = cgen_state_->ir_builder_.CreateICmp(
464  llvm::ICmpInst::ICMP_SGE, matching_row_index, cgen_state_->llInt<int64_t>(0));
465  }
466  const auto it_valid_bb = llvm::BasicBlock::Create(
467  cgen_state_->context_, "it_valid", cgen_state_->current_func_);
468  const auto it_not_valid_bb = llvm::BasicBlock::Create(
469  cgen_state_->context_, "it_not_valid", cgen_state_->current_func_);
470  cgen_state_->ir_builder_.CreateCondBr(is_valid_it, it_valid_bb, it_not_valid_bb);
471  const auto row_is_deleted_bb = llvm::BasicBlock::Create(
472  cgen_state_->context_, "row_is_deleted", cgen_state_->current_func_);
473  cgen_state_->ir_builder_.SetInsertPoint(it_valid_bb);
474  CodeGenerator code_generator(this);
475  const auto row_is_deleted = code_generator.toBool(
476  code_generator.codegen(deleted_expr.get(), true, co).front());
477  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
478  cgen_state_->ir_builder_.SetInsertPoint(it_not_valid_bb);
479  const auto row_is_deleted_default = cgen_state_->llBool(false);
480  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
481  cgen_state_->ir_builder_.SetInsertPoint(row_is_deleted_bb);
482  auto row_is_deleted_or_default =
483  cgen_state_->ir_builder_.CreatePHI(row_is_deleted->getType(), 2);
484  row_is_deleted_or_default->addIncoming(row_is_deleted, it_valid_bb);
485  row_is_deleted_or_default->addIncoming(row_is_deleted_default, it_not_valid_bb);
486  return row_is_deleted_or_default;
487  };
488 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635

◆ buildJoinLoops()

std::vector< JoinLoop > Executor::buildJoinLoops ( RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache 
)
private

Definition at line 260 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, anonymous_namespace{IRCodegen.cpp}::check_if_loop_join_is_allowed(), CHECK_LT, CodeGenerator::codegen(), get_arg_by_name(), INJECT_TIMER, CgenState::ir_builder_, join(), RelAlgExecutionUnit::join_quals, LEFT, CgenState::llBool(), CgenState::llInt(), JoinHashTableInterface::OneToOne, CgenState::outer_join_match_found_per_level_, CgenState::row_func_, Set, Singleton, CodeGenerator::toBool(), JoinCondition::type, UpperBound, and VLOG.

265  {
268  std::vector<JoinLoop> join_loops;
269  for (size_t level_idx = 0, current_hash_table_idx = 0;
270  level_idx < ra_exe_unit.join_quals.size();
271  ++level_idx) {
272  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
273  std::vector<std::string> fail_reasons;
274  const auto build_cur_level_hash_table = [&]() {
275  if (current_level_join_conditions.quals.size() > 1) {
276  const auto first_qual = *current_level_join_conditions.quals.begin();
277  auto qual_bin_oper =
278  std::dynamic_pointer_cast<const Analyzer::BinOper>(first_qual);
279  if (qual_bin_oper && qual_bin_oper->is_overlaps_oper() &&
280  current_level_join_conditions.type == JoinType::LEFT) {
281  JoinCondition join_condition{{first_qual}, current_level_join_conditions.type};
282 
284  join_condition, ra_exe_unit, co, query_infos, column_cache, fail_reasons);
285  }
286  }
287  return buildCurrentLevelHashTable(current_level_join_conditions,
288  ra_exe_unit,
289  co,
290  query_infos,
291  column_cache,
292  fail_reasons);
293  };
294  const auto current_level_hash_table = build_cur_level_hash_table();
295  const auto found_outer_join_matches_cb =
296  [this, level_idx](llvm::Value* found_outer_join_matches) {
297  CHECK_LT(level_idx, cgen_state_->outer_join_match_found_per_level_.size());
298  CHECK(!cgen_state_->outer_join_match_found_per_level_[level_idx]);
299  cgen_state_->outer_join_match_found_per_level_[level_idx] =
300  found_outer_join_matches;
301  };
302  const auto is_deleted_cb = buildIsDeletedCb(ra_exe_unit, level_idx, co);
303  const auto outer_join_condition_multi_quals_cb =
304  [this, level_idx, &co, &current_level_join_conditions](
305  const std::vector<llvm::Value*>& prev_iters) {
306  // The values generated for the match path don't dominate all uses
307  // since on the non-match path nulls are generated. Reset the cache
308  // once the condition is generated to avoid incorrect reuse.
309  FetchCacheAnchor anchor(cgen_state_.get());
310  addJoinLoopIterator(prev_iters, level_idx + 1);
311  llvm::Value* left_join_cond = cgen_state_->llBool(true);
312  CodeGenerator code_generator(this);
313  // Do not want to look at all quals! only 1..N quals (ignore first qual)
314  // Note(jclay): this may need to support cases larger than 2
315  // are there any?
316  if (current_level_join_conditions.quals.size() >= 2) {
317  auto qual_it = std::next(current_level_join_conditions.quals.begin(), 1);
318  for (; qual_it != current_level_join_conditions.quals.end();
319  std::advance(qual_it, 1)) {
320  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
321  left_join_cond,
322  code_generator.toBool(
323  code_generator.codegen(qual_it->get(), true, co).front()));
324  }
325  }
326  return left_join_cond;
327  };
328  if (current_level_hash_table) {
329  if (current_level_hash_table->getHashType() == JoinHashTable::HashType::OneToOne) {
330  join_loops.emplace_back(
331  /*kind=*/JoinLoopKind::Singleton,
332  /*type=*/current_level_join_conditions.type,
333  /*iteration_domain_codegen=*/
334  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
335  const std::vector<llvm::Value*>& prev_iters) {
336  addJoinLoopIterator(prev_iters, level_idx);
337  JoinLoopDomain domain{{0}};
338  domain.slot_lookup_result =
339  current_level_hash_table->codegenSlot(co, current_hash_table_idx);
340  return domain;
341  },
342  /*outer_condition_match=*/nullptr,
343  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
344  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
345  : nullptr,
346  /*is_deleted=*/is_deleted_cb);
347  } else {
348  join_loops.emplace_back(
349  /*kind=*/JoinLoopKind::Set,
350  /*type=*/current_level_join_conditions.type,
351  /*iteration_domain_codegen=*/
352  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
353  const std::vector<llvm::Value*>& prev_iters) {
354  addJoinLoopIterator(prev_iters, level_idx);
355  JoinLoopDomain domain{{0}};
356  const auto matching_set = current_level_hash_table->codegenMatchingSet(
357  co, current_hash_table_idx);
358  domain.values_buffer = matching_set.elements;
359  domain.element_count = matching_set.count;
360  return domain;
361  },
362  /*outer_condition_match=*/
363  current_level_join_conditions.type == JoinType::LEFT
364  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
365  outer_join_condition_multi_quals_cb)
366  : nullptr,
367  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
368  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
369  : nullptr,
370  /*is_deleted=*/is_deleted_cb);
371  }
372  ++current_hash_table_idx;
373  } else {
374  const auto fail_reasons_str = current_level_join_conditions.quals.empty()
375  ? "No equijoin expression found"
376  : boost::algorithm::join(fail_reasons, " | ");
378  ra_exe_unit, eo, query_infos, level_idx, fail_reasons_str);
379  // Callback provided to the `JoinLoop` framework to evaluate the (outer) join
380  // condition.
381  VLOG(1) << "Unable to build hash table, falling back to loop join: "
382  << fail_reasons_str;
383  const auto outer_join_condition_cb =
384  [this, level_idx, &co, &current_level_join_conditions](
385  const std::vector<llvm::Value*>& prev_iters) {
386  // The values generated for the match path don't dominate all uses
387  // since on the non-match path nulls are generated. Reset the cache
388  // once the condition is generated to avoid incorrect reuse.
389  FetchCacheAnchor anchor(cgen_state_.get());
390  addJoinLoopIterator(prev_iters, level_idx + 1);
391  llvm::Value* left_join_cond = cgen_state_->llBool(true);
392  CodeGenerator code_generator(this);
393  for (auto expr : current_level_join_conditions.quals) {
394  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
395  left_join_cond,
396  code_generator.toBool(
397  code_generator.codegen(expr.get(), true, co).front()));
398  }
399  return left_join_cond;
400  };
401  join_loops.emplace_back(
402  /*kind=*/JoinLoopKind::UpperBound,
403  /*type=*/current_level_join_conditions.type,
404  /*iteration_domain_codegen=*/
405  [this, level_idx](const std::vector<llvm::Value*>& prev_iters) {
406  addJoinLoopIterator(prev_iters, level_idx);
407  JoinLoopDomain domain{{0}};
408  const auto rows_per_scan_ptr = cgen_state_->ir_builder_.CreateGEP(
409  get_arg_by_name(cgen_state_->row_func_, "num_rows_per_scan"),
410  cgen_state_->llInt(int32_t(level_idx + 1)));
411  domain.upper_bound = cgen_state_->ir_builder_.CreateLoad(rows_per_scan_ptr,
412  "num_rows_per_scan");
413  return domain;
414  },
415  /*outer_condition_match=*/
416  current_level_join_conditions.type == JoinType::LEFT
417  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
418  outer_join_condition_cb)
419  : nullptr,
420  /*found_outer_matches=*/
421  current_level_join_conditions.type == JoinType::LEFT
422  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
423  : nullptr,
424  /*is_deleted=*/is_deleted_cb);
425  }
426  }
427  return join_loops;
428 }
std::string join(T const &container, std::string const &delim)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:129
#define INJECT_TIMER(DESC)
Definition: measure.h:93
const JoinQualsPerNestingLevel join_quals
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
void check_if_loop_join_is_allowed(RelAlgExecutionUnit &ra_exe_unit, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, const size_t level_idx, const std::string &fail_reason)
Definition: IRCodegen.cpp:238
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:260
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
Definition: IRCodegen.cpp:431
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635
#define VLOG(n)
Definition: Logger.h:291
std::shared_ptr< JoinHashTableInterface > buildCurrentLevelHashTable(const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
Definition: IRCodegen.cpp:490

◆ buildSelectedFragsMapping()

void Executor::buildSelectedFragsMapping ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList &  selected_fragments,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 2564 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LT, getFragmentCount(), RelAlgExecutionUnit::input_descs, and plan_state_.

Referenced by fetchChunks().

2569  {
2570  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2571  size_t frag_pos{0};
2572  const auto& input_descs = ra_exe_unit.input_descs;
2573  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2574  const int table_id = input_descs[scan_idx].getTableId();
2575  CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2576  selected_fragments_crossjoin.push_back(
2577  getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2578  for (const auto& col_id : col_global_ids) {
2579  CHECK(col_id);
2580  const auto& input_desc = col_id->getScanDesc();
2581  if (input_desc.getTableId() != table_id ||
2582  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2583  continue;
2584  }
2585  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2586  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2587  CHECK_LT(static_cast<size_t>(it->second),
2588  plan_state_->global_to_local_col_ids_.size());
2589  local_col_to_frag_pos[it->second] = frag_pos;
2590  }
2591  ++frag_pos;
2592  }
2593 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define CHECK_LT(x, y)
Definition: Logger.h:207
std::vector< size_t > getFragmentCount(const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:2550
#define CHECK(condition)
Definition: Logger.h:197

◆ buildSelectedFragsMappingForUnion()

void Executor::buildSelectedFragsMappingForUnion ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList &  selected_fragments,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 2595 of file Execute.cpp.

References CHECK, CHECK_LT, RelAlgExecutionUnit::input_descs, and plan_state_.

Referenced by fetchUnionChunks().

2600  {
2601  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2602  size_t frag_pos{0};
2603  const auto& input_descs = ra_exe_unit.input_descs;
2604  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2605  const int table_id = input_descs[scan_idx].getTableId();
2606  // selected_fragments here is from assignFragsToKernelDispatch
2607  // execution_kernel.fragments
2608  if (selected_fragments[0].table_id != table_id) { // TODO 0
2609  continue;
2610  }
2611  // CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2612  selected_fragments_crossjoin.push_back(
2613  // getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2614  {size_t(1)}); // TODO
2615  for (const auto& col_id : col_global_ids) {
2616  CHECK(col_id);
2617  const auto& input_desc = col_id->getScanDesc();
2618  if (input_desc.getTableId() != table_id ||
2619  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2620  continue;
2621  }
2622  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2623  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2624  CHECK_LT(static_cast<size_t>(it->second),
2625  plan_state_->global_to_local_col_ids_.size());
2626  local_col_to_frag_pos[it->second] = frag_pos;
2627  }
2628  ++frag_pos;
2629  }
2630 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the caller graph for this function:

◆ castToFP()

llvm::Value * Executor::castToFP ( llvm::Value *  val)
private

Definition at line 3143 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, cgen_state_, logger::FATAL, LOG, and to_string().

3143  {
3145  if (!val->getType()->isIntegerTy()) {
3146  return val;
3147  }
3148 
3149  auto val_width = static_cast<llvm::IntegerType*>(val->getType())->getBitWidth();
3150  llvm::Type* dest_ty{nullptr};
3151  switch (val_width) {
3152  case 32:
3153  dest_ty = llvm::Type::getFloatTy(cgen_state_->context_);
3154  break;
3155  case 64:
3156  dest_ty = llvm::Type::getDoubleTy(cgen_state_->context_);
3157  break;
3158  default:
3159  LOG(FATAL) << "Unsupported FP width: " << std::to_string(val_width);
3160  }
3161  return cgen_state_->ir_builder_.CreateSIToFP(val, dest_ty);
3162 }
#define LOG(tag)
Definition: Logger.h:188
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::string to_string(char const *&&v)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
+ Here is the call graph for this function:
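For orientation, here is a minimal standalone sketch of the same signed-int-to-FP widening, written against the plain LLVM C++ API; the helper name and builder argument are illustrative and are not OmniSciDB code (and, unlike castToFP(), the sketch does not abort on unsupported widths):

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Value.h>

// Illustrative helper: 32-bit integers become float, 64-bit integers become double,
// non-integer values pass through unchanged, mirroring the switch in castToFP().
llvm::Value* cast_int_to_fp(llvm::IRBuilder<>& builder, llvm::Value* val) {
  if (!val->getType()->isIntegerTy()) {
    return val;  // already a floating-point value
  }
  auto& ctx = builder.getContext();
  const auto width = val->getType()->getIntegerBitWidth();
  llvm::Type* dest_ty =
      width == 32 ? llvm::Type::getFloatTy(ctx) : llvm::Type::getDoubleTy(ctx);
  return builder.CreateSIToFP(val, dest_ty);  // signed int -> floating point
}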

◆ castToIntPtrTyIn()

llvm::Value * Executor::castToIntPtrTyIn ( llvm::Value *  val,
const size_t  bit_width 
)
private

Definition at line 3164 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, cgen_state_, CHECK, CHECK_LT, and get_int_type().

3164  {
3166  CHECK(val->getType()->isPointerTy());
3167 
3168  const auto val_ptr_type = static_cast<llvm::PointerType*>(val->getType());
3169  const auto val_type = val_ptr_type->getElementType();
3170  size_t val_width = 0;
3171  if (val_type->isIntegerTy()) {
3172  val_width = val_type->getIntegerBitWidth();
3173  } else {
3174  if (val_type->isFloatTy()) {
3175  val_width = 32;
3176  } else {
3177  CHECK(val_type->isDoubleTy());
3178  val_width = 64;
3179  }
3180  }
3181  CHECK_LT(size_t(0), val_width);
3182  if (bitWidth == val_width) {
3183  return val;
3184  }
3185  return cgen_state_->ir_builder_.CreateBitCast(
3186  val, llvm::PointerType::get(get_int_type(bitWidth, cgen_state_->context_), 0));
3187 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:
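A correspondingly small sketch of the pointer re-typing step, again using only the LLVM C++ API with illustrative names:

#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/IRBuilder.h>

// Illustrative helper: reinterpret an existing pointer as a pointer to an integer of
// the requested bit width, as castToIntPtrTyIn() does when the widths differ.
llvm::Value* to_int_ptr(llvm::IRBuilder<>& builder, llvm::Value* ptr, unsigned bits) {
  auto* int_ty = llvm::IntegerType::get(builder.getContext(), bits);
  return builder.CreateBitCast(ptr, llvm::PointerType::get(int_ty, /*AddressSpace=*/0));
}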

◆ checkCurrentQuerySession() [1/2]

template<typename SESSION_MAP_LOCK >
bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
SESSION_MAP_LOCK &  read_lock 
)

◆ checkCurrentQuerySession() [2/2]

template<>
bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3560 of file Execute.cpp.

References current_query_session_.

3561  {
3562  // the candidate session matches when it equals the current query session
3563  // (the case where both are the empty session is covered by the equality check)
3564  return (current_query_session_ == candidate_query_session);
3565 }
static std::string current_query_session_
Definition: Execute.h:961
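A hedged usage sketch: the executor pointer, mutex and session id below are placeholders for illustration; callers are expected to hold a shared lock on the session map before calling this member.

// Illustrative only: take a shared (read) lock, then ask whether the given session
// is the one currently executing on this Executor.
mapd_shared_lock<mapd_shared_mutex> session_read_lock(sessions_mutex);  // placeholder mutex
if (executor->checkCurrentQuerySession(candidate_session_id, session_read_lock)) {
  // candidate_session_id owns the currently running query
}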

◆ checkIsQuerySessionInterrupted() [1/2]

template<typename SESSION_MAP_LOCK >
bool Executor::checkIsQuerySessionInterrupted ( const std::string &  query_session,
SESSION_MAP_LOCK &  read_lock 
)

◆ checkIsQuerySessionInterrupted() [2/2]

template<>
bool Executor::checkIsQuerySessionInterrupted ( const std::string &  query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3601 of file Execute.cpp.

References queries_interrupt_flag_.

3603  {
3604  auto flag_it = queries_interrupt_flag_.find(query_session);
3605  return flag_it != queries_interrupt_flag_.end() && flag_it->second;
3606 }
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:963
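A similar hedged sketch for interrupt polling; names are placeholders, and the real call sites live in the kernel execution path:

// Illustrative only: under a shared lock, check the per-session interrupt flag and
// abort the current unit of work if it has been set.
mapd_shared_lock<mapd_shared_mutex> session_read_lock(sessions_mutex);  // placeholder mutex
if (executor->checkIsQuerySessionInterrupted(query_session_id, session_read_lock)) {
  throw std::runtime_error("Query execution has been interrupted");
}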

◆ clearMemory()

void Executor::clearMemory ( const Data_Namespace::MemoryLevel  memory_level)
static

Definition at line 172 of file Execute.cpp.

References Data_Namespace::DataMgr::clearMemory(), Data_Namespace::CPU_LEVEL, execute_mutex_, Catalog_Namespace::SysCatalog::getDataMgr(), Data_Namespace::GPU_LEVEL, Catalog_Namespace::SysCatalog::instance(), and CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCaches().

Referenced by DBHandler::clear_cpu_memory(), DBHandler::clear_gpu_memory(), QueryRunner::QueryRunner::clearCpuMemory(), and QueryRunner::QueryRunner::clearGpuMemory().

172  {
173  switch (memory_level) {
176  mapd_unique_lock<mapd_shared_mutex> flush_lock(
177  execute_mutex_); // Don't flush memory while queries are running
178 
180  if (memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
181  // The hash table cache uses CPU memory not managed by the buffer manager. In the
182  // future, we should manage these allocations with the buffer manager directly.
183  // For now, assume the user wants to purge the hash table cache when they clear
184  // CPU memory (currently used in ExecuteTest to lower memory pressure)
186  }
187  break;
188  }
189  default: {
190  throw std::runtime_error(
191  "Clearing memory levels other than the CPU level or GPU level is not "
192  "supported.");
193  }
194  }
195 }
static mapd_shared_mutex execute_mutex_
Definition: Execute.h:972
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:187
void clearMemory(const MemoryLevel memLevel)
Definition: DataMgr.cpp:377
static void invalidateCaches()
static SysCatalog & instance()
Definition: SysCatalog.h:286
+ Here is the call graph for this function:
+ Here is the caller graph for this function:
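Typical use, as in the DBHandler entry points listed above, is a one-liner; the wrapping function below is an illustrative sketch, not existing code.

#include <Execute.h>

// Flush CPU-level buffers; per the comment in the listing above this also purges the
// hash table cache. Passing GPU_LEVEL clears the GPU buffer pools instead.
void clear_cpu_buffers() {
  Executor::clearMemory(Data_Namespace::MemoryLevel::CPU_LEVEL);
}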

◆ clearMetaInfoCache()

void Executor::clearMetaInfoCache ( )
private

Definition at line 347 of file Execute.cpp.

References agg_col_range_cache_, StringDictionaryGenerations::clear(), TableGenerations::clear(), AggregatedColRange::clear(), InputTableInfoCache::clear(), input_table_info_cache_, string_dictionary_generations_, and table_generations_.

347  {
352 }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:957
StringDictionaryGenerations string_dictionary_generations_
Definition: Execute.h:958
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:956
TableGenerations table_generations_
Definition: Execute.h:959
+ Here is the call graph for this function:

◆ codegenAggregateWindowState()

llvm::Value * Executor::codegenAggregateWindowState ( )
private

Definition at line 326 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), Analyzer::WindowFunction::getKind(), kDECIMAL, kDOUBLE, and kFLOAT.

326  {
328  const auto pi32_type =
329  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
330  const auto pi64_type =
331  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
332  const auto window_func_context =
334  const Analyzer::WindowFunction* window_func = window_func_context->getWindowFunction();
335  const auto window_func_ti = get_adjusted_window_type_info(window_func);
336  const auto aggregate_state_type =
337  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
338  auto aggregate_state = aggregateWindowStatePtr();
339  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
340  const auto aggregate_state_count_i64 = cgen_state_->llInt(
341  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
342  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
343  aggregate_state_count_i64, aggregate_state_type);
344  const auto double_null_lv = cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE));
345  switch (window_func_ti.get_type()) {
346  case kFLOAT: {
347  return cgen_state_->emitCall(
348  "load_avg_float", {aggregate_state, aggregate_state_count, double_null_lv});
349  }
350  case kDOUBLE: {
351  return cgen_state_->emitCall(
352  "load_avg_double", {aggregate_state, aggregate_state_count, double_null_lv});
353  }
354  case kDECIMAL: {
355  return cgen_state_->emitCall(
356  "load_avg_decimal",
357  {aggregate_state,
358  aggregate_state_count,
359  double_null_lv,
360  cgen_state_->llInt<int32_t>(window_func_ti.get_scale())});
361  }
362  default: {
363  return cgen_state_->emitCall(
364  "load_avg_int", {aggregate_state, aggregate_state_count, double_null_lv});
365  }
366  }
367  }
368  if (window_func->getKind() == SqlWindowFunctionKind::COUNT) {
369  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
370  }
371  switch (window_func_ti.get_type()) {
372  case kFLOAT: {
373  return cgen_state_->emitCall("load_float", {aggregate_state});
374  }
375  case kDOUBLE: {
376  return cgen_state_->emitCall("load_double", {aggregate_state});
377  }
378  default: {
379  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
380  }
381  }
382 }
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SqlWindowFunctionKind getKind() const
Definition: Analyzer.h:1447
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenJoinLoops()

void Executor::codegenJoinLoops ( const std::vector< JoinLoop > &  join_loops,
const RelAlgExecutionUnit &  ra_exe_unit,
GroupByAndAggregate &  group_by_and_aggregate,
llvm::Function *  query_func,
llvm::BasicBlock *  entry_bb,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
const ExecutionOptions &  eo 
)
private

Definition at line 653 of file IRCodegen.cpp.

References ExecutionOptions::allow_runtime_query_interrupt, AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, JoinLoop::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::device_type, CgenState::ir_builder_, CgenState::llInt(), CgenState::needs_error_check_, CodeGenerator::posArg(), and ExecutionOptions::with_dynamic_watchdog.

660  {
662  const auto exit_bb =
663  llvm::BasicBlock::Create(cgen_state_->context_, "exit", cgen_state_->current_func_);
664  cgen_state_->ir_builder_.SetInsertPoint(exit_bb);
665  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
666  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
667  CodeGenerator code_generator(this);
668  const auto loops_entry_bb = JoinLoop::codegen(
669  join_loops,
670  [this,
671  query_func,
672  &query_mem_desc,
673  &co,
674  &eo,
675  &group_by_and_aggregate,
676  &join_loops,
677  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
679  addJoinLoopIterator(prev_iters, join_loops.size());
680  auto& builder = cgen_state_->ir_builder_;
681  const auto loop_body_bb = llvm::BasicBlock::Create(
682  builder.getContext(), "loop_body", builder.GetInsertBlock()->getParent());
683  builder.SetInsertPoint(loop_body_bb);
684  const bool can_return_error =
685  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
686  if (can_return_error || cgen_state_->needs_error_check_ ||
688  createErrorCheckControlFlow(query_func,
691  co.device_type);
692  }
693  return loop_body_bb;
694  },
695  code_generator.posArg(nullptr),
696  exit_bb,
697  cgen_state_.get());
698  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
699  cgen_state_->ir_builder_.CreateBr(loops_entry_bb);
700 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
const bool with_dynamic_watchdog
static llvm::BasicBlock * codegen(const std::vector< JoinLoop > &join_loops, const std::function< llvm::BasicBlock *(const std::vector< llvm::Value *> &)> &body_codegen, llvm::Value *outer_iter, llvm::BasicBlock *exit_bb, CgenState *cgen_state)
Definition: JoinLoop.cpp:46
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type)
const bool allow_runtime_query_interrupt
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value *> &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635
+ Here is the call graph for this function:

◆ codegenSkipDeletedOuterTableRow()

llvm::BasicBlock * Executor::codegenSkipDeletedOuterTableRow ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 2465 of file NativeCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CompilationOptions::filter_on_deleted_column, RelAlgExecutionUnit::input_descs, and TABLE.

2467  {
2469  if (!co.filter_on_deleted_column) {
2470  return nullptr;
2471  }
2472  CHECK(!ra_exe_unit.input_descs.empty());
2473  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2474  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2475  return nullptr;
2476  }
2477  const auto deleted_cd =
2478  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2479  if (!deleted_cd) {
2480  return nullptr;
2481  }
2482  CHECK(deleted_cd->columnType.is_boolean());
2483  const auto deleted_expr =
2484  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2485  outer_input_desc.getTableId(),
2486  deleted_cd->columnId,
2487  outer_input_desc.getNestLevel());
2488  CodeGenerator code_generator(this);
2489  const auto is_deleted =
2490  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2491  const auto is_deleted_bb = llvm::BasicBlock::Create(
2492  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2493  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2494  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2495  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2496  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2497  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2498  cgen_state_->ir_builder_.SetInsertPoint(bb);
2499  return bb;
2500 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:914
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:197

◆ codegenWindowAvgEpilogue()

void Executor::codegenWindowAvgEpilogue ( llvm::Value *  crt_val,
llvm::Value *  window_func_null_val,
llvm::Value *  multiplicity_lv 
)
private

Definition at line 289 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

291  {
293  const auto window_func_context =
295  const auto window_func = window_func_context->getWindowFunction();
296  const auto window_func_ti = get_adjusted_window_type_info(window_func);
297  const auto pi32_type =
298  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
299  const auto pi64_type =
300  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
301  const auto aggregate_state_type =
302  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
303  const auto aggregate_state_count_i64 = cgen_state_->llInt(
304  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
305  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
306  aggregate_state_count_i64, aggregate_state_type);
307  std::string agg_count_func_name = "agg_count";
308  switch (window_func_ti.get_type()) {
309  case kFLOAT: {
310  agg_count_func_name += "_float";
311  break;
312  }
313  case kDOUBLE: {
314  agg_count_func_name += "_double";
315  break;
316  }
317  default: {
318  break;
319  }
320  }
321  agg_count_func_name += "_skip_val";
322  cgen_state_->emitCall(agg_count_func_name,
323  {aggregate_state_count, crt_val, window_func_null_val});
324 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenWindowFunction()

llvm::Value * Executor::codegenWindowFunction ( const size_t  target_index,
const CompilationOptions &  co 
)
private

Definition at line 21 of file WindowFunctionIR.cpp.

References WindowProjectNodeContext::activateWindowFunctionContext(), run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, COUNT, CUME_DIST, DENSE_RANK, logger::FATAL, FIRST_VALUE, WindowProjectNodeContext::get(), WindowFunctionContext::getWindowFunction(), LAG, LAST_VALUE, LEAD, LOG, MAX, MIN, NTILE, PERCENT_RANK, RANK, ROW_NUMBER, and SUM.

22  {
24  CodeGenerator code_generator(this);
25  const auto window_func_context =
27  target_index);
28  const auto window_func = window_func_context->getWindowFunction();
29  switch (window_func->getKind()) {
34  return cgen_state_->emitCall("row_number_window_func",
35  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
36  window_func_context->output())),
37  code_generator.posArg(nullptr)});
38  }
41  return cgen_state_->emitCall("percent_window_func",
42  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
43  window_func_context->output())),
44  code_generator.posArg(nullptr)});
45  }
51  const auto& args = window_func->getArgs();
52  CHECK(!args.empty());
53  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
54  CHECK_EQ(arg_lvs.size(), size_t(1));
55  return arg_lvs.front();
56  }
63  }
64  default: {
65  LOG(FATAL) << "Invalid window function kind";
66  }
67  }
68  return nullptr;
69 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
#define LOG(tag)
Definition: Logger.h:188
const WindowFunctionContext * activateWindowFunctionContext(Executor *executor, const size_t target_index) const
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
static const WindowProjectNodeContext * get(Executor *executor)
const Analyzer::WindowFunction * getWindowFunction() const
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenWindowFunctionAggregate(const CompilationOptions &co)
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:

◆ codegenWindowFunctionAggregate()

llvm::Value * Executor::codegenWindowFunctionAggregate ( const CompilationOptions &  co)
private

Definition at line 140 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, CHECK, WindowProjectNodeContext::get(), get_int_type(), and WindowProjectNodeContext::getActiveWindowFunctionContext().

140  {
142  const auto reset_state_false_bb = codegenWindowResetStateControlFlow();
143  auto aggregate_state = aggregateWindowStatePtr();
144  llvm::Value* aggregate_state_count = nullptr;
145  const auto window_func_context =
147  const auto window_func = window_func_context->getWindowFunction();
148  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
149  const auto aggregate_state_count_i64 = cgen_state_->llInt(
150  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
151  const auto pi64_type =
152  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
153  aggregate_state_count =
154  cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_count_i64, pi64_type);
155  }
156  codegenWindowFunctionStateInit(aggregate_state);
157  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
158  const auto count_zero = cgen_state_->llInt(int64_t(0));
159  cgen_state_->emitCall("agg_id", {aggregate_state_count, count_zero});
160  }
161  cgen_state_->ir_builder_.CreateBr(reset_state_false_bb);
162  cgen_state_->ir_builder_.SetInsertPoint(reset_state_false_bb);
164  return codegenWindowFunctionAggregateCalls(aggregate_state, co);
165 }
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
static const WindowProjectNodeContext * get(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
void codegenWindowFunctionStateInit(llvm::Value *aggregate_state)
#define CHECK(condition)
Definition: Logger.h:197
llvm::Value * codegenWindowFunctionAggregateCalls(llvm::Value *aggregate_state, const CompilationOptions &co)
llvm::BasicBlock * codegenWindowResetStateControlFlow()
+ Here is the call graph for this function:

◆ codegenWindowFunctionAggregateCalls()

llvm::Value * Executor::codegenWindowFunctionAggregateCalls ( llvm::Value *  aggregate_state,
const CompilationOptions &  co 
)
private

Definition at line 246 of file WindowFunctionIR.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, CodeGenerator::codegen(), CodeGenerator::codegenCastBetweenIntTypes(), COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), anonymous_namespace{WindowFunctionIR.cpp}::get_window_agg_name(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kFLOAT, and SUM.

247  {
249  const auto window_func_context =
251  const auto window_func = window_func_context->getWindowFunction();
252  const auto window_func_ti = get_adjusted_window_type_info(window_func);
253  const auto window_func_null_val =
254  window_func_ti.is_fp()
255  ? cgen_state_->inlineFpNull(window_func_ti)
256  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
257  const auto& args = window_func->getArgs();
258  llvm::Value* crt_val;
259  if (args.empty()) {
260  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
261  crt_val = cgen_state_->llInt(int64_t(1));
262  } else {
263  CodeGenerator code_generator(this);
264  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
265  CHECK_EQ(arg_lvs.size(), size_t(1));
266  if (window_func->getKind() == SqlWindowFunctionKind::SUM && !window_func_ti.is_fp()) {
267  crt_val = code_generator.codegenCastBetweenIntTypes(
268  arg_lvs.front(), args.front()->get_type_info(), window_func_ti, false);
269  } else {
270  crt_val = window_func_ti.get_type() == kFLOAT
271  ? arg_lvs.front()
272  : cgen_state_->castToTypeIn(arg_lvs.front(), 64);
273  }
274  }
275  const auto agg_name = get_window_agg_name(window_func->getKind(), window_func_ti);
276  llvm::Value* multiplicity_lv = nullptr;
277  if (args.empty()) {
278  cgen_state_->emitCall(agg_name, {aggregate_state, crt_val});
279  } else {
280  cgen_state_->emitCall(agg_name + "_skip_val",
281  {aggregate_state, crt_val, window_func_null_val});
282  }
283  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
284  codegenWindowAvgEpilogue(crt_val, window_func_null_val, multiplicity_lv);
285  }
287 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
std::string get_window_agg_name(const SqlWindowFunctionKind kind, const SQLTypeInfo &window_func_ti)
void codegenWindowAvgEpilogue(llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenAggregateWindowState()
#define CHECK(condition)
Definition: Logger.h:197
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenWindowFunctionStateInit()

void Executor::codegenWindowFunctionStateInit ( llvm::Value *  aggregate_state)
private

Definition at line 196 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

196  {
198  const auto window_func_context =
200  const auto window_func = window_func_context->getWindowFunction();
201  const auto window_func_ti = get_adjusted_window_type_info(window_func);
202  const auto window_func_null_val =
203  window_func_ti.is_fp()
204  ? cgen_state_->inlineFpNull(window_func_ti)
205  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
206  llvm::Value* window_func_init_val;
207  if (window_func_context->getWindowFunction()->getKind() ==
209  switch (window_func_ti.get_type()) {
210  case kFLOAT: {
211  window_func_init_val = cgen_state_->llFp(float(0));
212  break;
213  }
214  case kDOUBLE: {
215  window_func_init_val = cgen_state_->llFp(double(0));
216  break;
217  }
218  default: {
219  window_func_init_val = cgen_state_->llInt(int64_t(0));
220  break;
221  }
222  }
223  } else {
224  window_func_init_val = window_func_null_val;
225  }
226  const auto pi32_type =
227  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
228  switch (window_func_ti.get_type()) {
229  case kDOUBLE: {
230  cgen_state_->emitCall("agg_id_double", {aggregate_state, window_func_init_val});
231  break;
232  }
233  case kFLOAT: {
234  aggregate_state =
235  cgen_state_->ir_builder_.CreateBitCast(aggregate_state, pi32_type);
236  cgen_state_->emitCall("agg_id_float", {aggregate_state, window_func_init_val});
237  break;
238  }
239  default: {
240  cgen_state_->emitCall("agg_id", {aggregate_state, window_func_init_val});
241  break;
242  }
243  }
244 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
+ Here is the call graph for this function:

◆ codegenWindowResetStateControlFlow()

llvm::BasicBlock * Executor::codegenWindowResetStateControlFlow ( )
private

Definition at line 167 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, WindowProjectNodeContext::getActiveWindowFunctionContext(), and CodeGenerator::toBool().

167  {
169  const auto window_func_context =
171  const auto bitset = cgen_state_->llInt(
172  reinterpret_cast<const int64_t>(window_func_context->partitionStart()));
173  const auto min_val = cgen_state_->llInt(int64_t(0));
174  const auto max_val = cgen_state_->llInt(window_func_context->elementCount() - 1);
175  const auto null_val = cgen_state_->llInt(inline_int_null_value<int64_t>());
176  const auto null_bool_val = cgen_state_->llInt<int8_t>(inline_int_null_value<int8_t>());
177  CodeGenerator code_generator(this);
178  const auto reset_state =
179  code_generator.toBool(cgen_state_->emitCall("bit_is_set",
180  {bitset,
181  code_generator.posArg(nullptr),
182  min_val,
183  max_val,
184  null_val,
185  null_bool_val}));
186  const auto reset_state_true_bb = llvm::BasicBlock::Create(
187  cgen_state_->context_, "reset_state.true", cgen_state_->current_func_);
188  const auto reset_state_false_bb = llvm::BasicBlock::Create(
189  cgen_state_->context_, "reset_state.false", cgen_state_->current_func_);
190  cgen_state_->ir_builder_.CreateCondBr(
191  reset_state, reset_state_true_bb, reset_state_false_bb);
192  cgen_state_->ir_builder_.SetInsertPoint(reset_state_true_bb);
193  return reset_state_false_bb;
194 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
+ Here is the call graph for this function:

◆ collectAllDeviceResults()

ResultSetPtr Executor::collectAllDeviceResults ( SharedKernelContext &  shared_context,
const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner 
)
private

Definition at line 1750 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::build_row_for_empty_input(), catalog_, collectAllDeviceShardedTopResults(), DEBUG_TIMER, SharedKernelContext::getFragmentResults(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, NonGroupedAggregate, reduceMultiDeviceResults(), reduceSpeculativeTopN(), GroupByAndAggregate::shard_count_for_top_groups(), RelAlgExecutionUnit::target_exprs, and use_speculative_top_n().

Referenced by executeWorkUnitImpl().

1755  {
1756  auto timer = DEBUG_TIMER(__func__);
1757  auto& result_per_device = shared_context.getFragmentResults();
1758  if (result_per_device.empty() && query_mem_desc.getQueryDescriptionType() ==
1761  ra_exe_unit.target_exprs, query_mem_desc, device_type);
1762  }
1763  if (use_speculative_top_n(ra_exe_unit, query_mem_desc)) {
1764  try {
1765  return reduceSpeculativeTopN(
1766  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1767  } catch (const std::bad_alloc&) {
1768  throw SpeculativeTopNFailed("Failed during multi-device reduction.");
1769  }
1770  }
1771  const auto shard_count =
1772  device_type == ExecutorDeviceType::GPU
1774  : 0;
1775 
1776  if (shard_count && !result_per_device.empty()) {
1777  return collectAllDeviceShardedTopResults(shared_context, ra_exe_unit);
1778  }
1779  return reduceMultiDeviceResults(
1780  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1781 }
ResultSetPtr build_row_for_empty_input(const std::vector< Analyzer::Expr *> &target_exprs_in, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: Execute.cpp:1713
std::vector< Analyzer::Expr * > target_exprs
ResultSetPtr reduceMultiDeviceResults(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:869
bool use_speculative_top_n(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc)
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:944
ResultSetPtr reduceSpeculativeTopN(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:969
ResultSetPtr collectAllDeviceShardedTopResults(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
Definition: Execute.cpp:1865
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define DEBUG_TIMER(name)
Definition: Logger.h:313
QueryDescriptionType getQueryDescriptionType() const
static size_t shard_count_for_top_groups(const RelAlgExecutionUnit &ra_exe_unit, const Catalog_Namespace::Catalog &catalog)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ collectAllDeviceShardedTopResults()

ResultSetPtr Executor::collectAllDeviceShardedTopResults ( SharedKernelContext &  shared_context,
const RelAlgExecutionUnit &  ra_exe_unit 
) const
private

Definition at line 1865 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LE, SharedKernelContext::getFragmentResults(), SortInfo::limit, SortInfo::offset, SortInfo::order_entries, anonymous_namespace{Execute.cpp}::permute_storage_columnar(), anonymous_namespace{Execute.cpp}::permute_storage_row_wise(), run_benchmark_import::result, and RelAlgExecutionUnit::sort_info.

Referenced by collectAllDeviceResults().

1867  {
1868  auto& result_per_device = shared_context.getFragmentResults();
1869  const auto first_result_set = result_per_device.front().first;
1870  CHECK(first_result_set);
1871  auto top_query_mem_desc = first_result_set->getQueryMemDesc();
1872  CHECK(!top_query_mem_desc.hasInterleavedBinsOnGpu());
1873  const auto top_n = ra_exe_unit.sort_info.limit + ra_exe_unit.sort_info.offset;
1874  top_query_mem_desc.setEntryCount(0);
1875  for (auto& result : result_per_device) {
1876  const auto result_set = result.first;
1877  CHECK(result_set);
1878  result_set->sort(ra_exe_unit.sort_info.order_entries, top_n);
1879  size_t new_entry_cnt = top_query_mem_desc.getEntryCount() + result_set->rowCount();
1880  top_query_mem_desc.setEntryCount(new_entry_cnt);
1881  }
1882  auto top_result_set = std::make_shared<ResultSet>(first_result_set->getTargetInfos(),
1883  first_result_set->getDeviceType(),
1884  top_query_mem_desc,
1885  first_result_set->getRowSetMemOwner(),
1886  this);
1887  auto top_storage = top_result_set->allocateStorage();
1888  size_t top_output_row_idx{0};
1889  for (auto& result : result_per_device) {
1890  const auto result_set = result.first;
1891  CHECK(result_set);
1892  const auto& top_permutation = result_set->getPermutationBuffer();
1893  CHECK_LE(top_permutation.size(), top_n);
1894  if (top_query_mem_desc.didOutputColumnar()) {
1895  top_output_row_idx = permute_storage_columnar(result_set->getStorage(),
1896  result_set->getQueryMemDesc(),
1897  top_storage,
1898  top_output_row_idx,
1899  top_query_mem_desc,
1900  top_permutation);
1901  } else {
1902  top_output_row_idx = permute_storage_row_wise(result_set->getStorage(),
1903  top_storage,
1904  top_output_row_idx,
1905  top_query_mem_desc,
1906  top_permutation);
1907  }
1908  }
1909  CHECK_EQ(top_output_row_idx, top_query_mem_desc.getEntryCount());
1910  return top_result_set;
1911 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const std::list< Analyzer::OrderEntry > order_entries
size_t permute_storage_row_wise(const ResultSetStorage *input_storage, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1844
const size_t limit
const SortInfo sort_info
#define CHECK_LE(x, y)
Definition: Logger.h:208
size_t permute_storage_columnar(const ResultSetStorage *input_storage, const QueryMemoryDescriptor &input_query_mem_desc, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1794
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define CHECK(condition)
Definition: Logger.h:197
const size_t offset
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ compileBody()

bool Executor::compileBody ( const RelAlgExecutionUnit &  ra_exe_unit,
GroupByAndAggregate &  group_by_and_aggregate,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
const GpuSharedMemoryContext &  gpu_smem_context = {} 
)
private

Definition at line 2502 of file NativeCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CodeGenerator::codegen(), GroupByAndAggregate::codegen(), get_int_type(), RelAlgExecutionUnit::join_quals, CodeGenerator::prioritizeQuals(), to_string(), CodeGenerator::toBool(), and VLOG.

2506  {
2508 
2509  // Switch the code generation into a separate filter function if enabled.
2510  // Note that accesses to function arguments are still codegenned from the
2511  // row function's arguments, then later automatically forwarded and
2512  // remapped into filter function arguments by redeclareFilterFunction().
2513  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2514  llvm::Value* loop_done{nullptr};
2515  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2516  if (cgen_state_->filter_func_) {
2517  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2518  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2519  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2520  row_func_entry_bb->begin());
2521  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2522  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2523  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2524  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2525  }
2526  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2527  cgen_state_->current_func_ = cgen_state_->filter_func_;
2528  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2529  }
2530 
2531  // generate the code for the filter
2532  std::vector<Analyzer::Expr*> primary_quals;
2533  std::vector<Analyzer::Expr*> deferred_quals;
2534  bool short_circuited =
2535  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
2536  if (short_circuited) {
2537  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2538  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2539  << " quals";
2540  }
2541  llvm::Value* filter_lv = cgen_state_->llBool(true);
2542  CodeGenerator code_generator(this);
2543  for (auto expr : primary_quals) {
2544  // Generate the filter for primary quals
2545  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2546  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2547  }
2548  CHECK(filter_lv->getType()->isIntegerTy(1));
2549  llvm::BasicBlock* sc_false{nullptr};
2550  if (!deferred_quals.empty()) {
2551  auto sc_true = llvm::BasicBlock::Create(
2552  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2553  sc_false = llvm::BasicBlock::Create(
2554  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2555  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2556  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2557  if (ra_exe_unit.join_quals.empty()) {
2558  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2559  }
2560  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2561  filter_lv = cgen_state_->llBool(true);
2562  }
2563  for (auto expr : deferred_quals) {
2564  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2565  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2566  }
2567 
2568  CHECK(filter_lv->getType()->isIntegerTy(1));
2569  auto ret = group_by_and_aggregate.codegen(
2570  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2571 
2572  // Switch the code generation back to the row function if a filter
2573  // function was enabled.
2574  if (cgen_state_->filter_func_) {
2575  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2576  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
2577  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2578  }
2579 
2581 
2582  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2583  cgen_state_->current_func_ = cgen_state_->row_func_;
2584  cgen_state_->filter_func_call_ =
2585  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2586 
2587  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2588  auto loop_done_true = llvm::BasicBlock::Create(
2589  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
2590  auto loop_done_false = llvm::BasicBlock::Create(
2591  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
2592  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2593  cgen_state_->ir_builder_.CreateCondBr(
2594  loop_done_flag, loop_done_true, loop_done_false);
2595  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2596  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2597  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2598  } else {
2599  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2600  }
2601  }
2602  return ret;
2603 }
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:899
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
const JoinQualsPerNestingLevel join_quals
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr *> &primary_quals, std::vector< Analyzer::Expr *> &deferred_quals)
Definition: LogicalIR.cpp:157
#define CHECK(condition)
Definition: Logger.h:197
void redeclareFilterFunction()
Definition: IRCodegen.cpp:537
#define VLOG(n)
Definition: Logger.h:291
+ Here is the call graph for this function:
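The qual-prioritization logic above amounts to the following scalar semantics; this is a self-contained sketch, not the generated IR, and Row and Qual are stand-ins for the codegenned row and predicates:

#include <functional>
#include <vector>

struct Row {};                                  // stand-in for a fetched input row
using Qual = std::function<bool(const Row&)>;   // stand-in for a codegenned predicate

// Primary (cheap, prioritized) quals are always evaluated; deferred quals only run when
// the primary ones pass, mirroring the sc_true / sc_false blocks emitted by compileBody().
bool row_passes(const Row& row,
                const std::vector<Qual>& primary_quals,
                const std::vector<Qual>& deferred_quals) {
  bool filter = true;
  for (const auto& qual : primary_quals) {
    filter = filter && qual(row);
  }
  if (!filter) {
    return false;  // short-circuit: deferred quals are never evaluated
  }
  for (const auto& qual : deferred_quals) {
    filter = filter && qual(row);
  }
  return filter;
}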

◆ compileWorkUnit()

std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > Executor::compileWorkUnit ( const std::vector< InputTableInfo > &  query_infos,
const PlanState::DeletedColumnsMap &  deleted_cols_map,
const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const CudaMgr_Namespace::CudaMgr *  cuda_mgr,
const bool  allow_lazy_fetch,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
const bool  has_cardinality_estimation,
ColumnCacheMap &  column_cache,
RenderInfo *  render_info = nullptr 
)
private

Definition at line 2076 of file NativeCodegen.cpp.

References ExecutionOptions::allow_multifrag, ExecutionOptions::allow_runtime_query_interrupt, CodeGenerator::alwaysCloneRuntimeFunction(), logger::ASM, AUTOMATIC_IR_METADATA, AUTOMATIC_IR_METADATA_DONE, anonymous_namespace{NativeCodegen.cpp}::bind_pos_placeholders(), anonymous_namespace{NativeCodegen.cpp}::bind_query(), CHECK, CHECK_EQ, GpuSharedMemCodeBuilder::codegen(), CPU, anonymous_namespace{NativeCodegen.cpp}::create_row_function(), logger::DEBUG1, DEBUG_TIMER, CompilationOptions::device_type, RelAlgExecutionUnit::estimator, CompilationOptions::explain_type, g_enable_filter_function, g_gpu_smem_threshold, generate_column_heads_load(), anonymous_namespace{NativeCodegen.cpp}::get_agg_fnames(), get_arg_by_name(), get_gpu_data_layout(), get_gpu_target_triple_string(), get_int_type(), anonymous_namespace{NativeCodegen.cpp}::get_shared_memory_size(), GroupByAndAggregate::getColRangeInfo(), GpuSharedMemCodeBuilder::getInitFunction(), GpuSharedMemCodeBuilder::getReductionFunction(), GPU, RelAlgExecutionUnit::groupby_exprs, GroupByBaselineHash, GroupByPerfectHash, CompilationOptions::hoist_literals, init_agg_val_vec(), GroupByAndAggregate::initQueryMemoryDescriptor(), GpuSharedMemCodeBuilder::injectFunctionsInto(), RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, Invalid, logger::IR, anonymous_namespace{NativeCodegen.cpp}::is_gpu_shared_mem_supported(), is_rt_udf_module_present(), is_udf_module_present(), RenderInfo::isPotentialInSituRender(), GpuSharedMemoryContext::isSharedMemoryUsed(), ExecutionOptions::just_explain, CodeGenerator::link_udf_module(), LOG, mark_function_always_inline(), CodeGenerator::markDeadRuntimeFuncs(), anonymous_namespace{NativeCodegen.cpp}::optimize_ir(), Optimized, ExecutionOptions::output_columnar_hint, logger::PTX, RelAlgExecutionUnit::quals, query_group_by_template(), query_template(), rt_udf_cpu_module, rt_udf_gpu_module, RelAlgExecutionUnit::scan_limit, anonymous_namespace{NativeCodegen.cpp}::serialize_llvm_metadata_footnotes(), serialize_llvm_object(), StdSet, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), to_string(), GpuSharedMemCodeBuilder::toString(), udf_cpu_module, udf_gpu_module, verify_function_ir(), VLOG, and ExecutionOptions::with_dynamic_watchdog.

2088  {
2089  auto timer = DEBUG_TIMER(__func__);
2090 
2091 #ifndef NDEBUG
2092  static std::uint64_t counter = 0;
2093  ++counter;
2094  VLOG(1) << "CODEGEN #" << counter << ":";
2095  LOG(IR) << "CODEGEN #" << counter << ":";
2096  LOG(PTX) << "CODEGEN #" << counter << ":";
2097  LOG(ASM) << "CODEGEN #" << counter << ":";
2098 #endif
2099 
2100  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2101 
2102  GroupByAndAggregate group_by_and_aggregate(
2103  this,
2104  co.device_type,
2105  ra_exe_unit,
2106  query_infos,
2107  row_set_mem_owner,
2108  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2109  : std::nullopt);
2110  auto query_mem_desc =
2111  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2112  max_groups_buffer_entry_guess,
2113  crt_min_byte_width,
2114  render_info,
2116 
2117  if (query_mem_desc->getQueryDescriptionType() ==
2119  !has_cardinality_estimation &&
2120  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2121  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2122  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2123  }
2124 
2125  const bool output_columnar = query_mem_desc->didOutputColumnar();
2126  const bool gpu_shared_mem_optimization =
2127  is_gpu_shared_mem_supported(query_mem_desc.get(),
2128  ra_exe_unit,
2129  cuda_mgr,
2130  co.device_type,
2131  cuda_mgr ? this->blockSize() : 1,
2132  cuda_mgr ? this->numBlocksPerMP() : 1);
2133  if (gpu_shared_mem_optimization) {
2134  // disable interleaved bins optimization on the GPU
2135  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2136  LOG(DEBUG1) << "GPU shared memory is used for the " +
2137  query_mem_desc->queryDescTypeToString() + " query(" +
2138  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2139  query_mem_desc.get())) +
2140  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2141  }
2142 
2143  const GpuSharedMemoryContext gpu_smem_context(
2144  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2145 
2147  const size_t num_count_distinct_descs =
2148  query_mem_desc->getCountDistinctDescriptorsSize();
2149  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2150  const auto& count_distinct_descriptor =
2151  query_mem_desc->getCountDistinctDescriptor(i);
2152  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2153  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2154  !co.hoist_literals)) {
2155  throw QueryMustRunOnCpu();
2156  }
2157  }
2158  }
2159 
2160  // Read the module template and target either CPU or GPU
2161  // by binding the stream position functions to the right implementation:
2162  // stride access for GPU, contiguous for CPU
2163  auto rt_module_copy = llvm::CloneModule(
2164 #if LLVM_VERSION_MAJOR >= 7
2165  *g_rt_module.get(),
2166 #else
2167  g_rt_module.get(),
2168 #endif
2169  cgen_state_->vmap_,
2170  [](const llvm::GlobalValue* gv) {
2171  auto func = llvm::dyn_cast<llvm::Function>(gv);
2172  if (!func) {
2173  return true;
2174  }
2175  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2176  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2178  });
2179 
2181  if (is_udf_module_present(true)) {
2183  }
2184  if (is_rt_udf_module_present(true)) {
2186  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2187  }
2188  } else {
2189  rt_module_copy->setDataLayout(get_gpu_data_layout());
2190  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2191 
2192  if (is_udf_module_present()) {
2193  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
2194 
2195  if (!gpu_triple.isNVPTX()) {
2196  throw QueryMustRunOnCpu();
2197  }
2198 
2200  }
2201  if (is_rt_udf_module_present()) {
2203  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2204  }
2205  }
2206 
2207  cgen_state_->module_ = rt_module_copy.release();
2209 
2210  auto agg_fnames =
2211  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2212 
2213  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2214 
2215  const bool is_group_by{query_mem_desc->isGroupBy()};
2216  auto [query_func, row_func_call] = is_group_by
2218  co.hoist_literals,
2219  *query_mem_desc,
2220  co.device_type,
2221  ra_exe_unit.scan_limit,
2222  gpu_smem_context)
2223  : query_template(cgen_state_->module_,
2224  agg_slot_count,
2225  co.hoist_literals,
2226  !!ra_exe_unit.estimator,
2227  gpu_smem_context);
2228  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2229  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2230  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2231 
2232  cgen_state_->query_func_ = query_func;
2233  cgen_state_->row_func_call_ = row_func_call;
2234  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2235  &query_func->getEntryBlock().front());
2236 
2237  // Generate the function signature and column head fetches s.t.
2238  // double indirection isn't needed in the inner loop
2239  auto& fetch_bb = query_func->front();
2240  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2241  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2242  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2243  query_func->args().begin(),
2244  fetch_ir_builder,
2245  cgen_state_->context_);
2246  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2247 
2248  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2249  is_group_by ? 0 : agg_slot_count,
2250  co.hoist_literals,
2251  cgen_state_->module_,
2252  cgen_state_->context_);
2253  CHECK(cgen_state_->row_func_);
2254  cgen_state_->row_func_bb_ =
2255  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2256 
2258  auto filter_func_ft =
2259  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2260  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2261  llvm::Function::ExternalLinkage,
2262  "filter_func",
2263  cgen_state_->module_);
2264  CHECK(cgen_state_->filter_func_);
2265  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2266  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2267  }
2268 
2269  cgen_state_->current_func_ = cgen_state_->row_func_;
2270  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2271 
2272  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2273  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2274  const auto join_loops =
2275  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2276 
2277  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2278  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2279  if (is_not_deleted_bb) {
2280  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2281  }
2282  if (!join_loops.empty()) {
2283  codegenJoinLoops(join_loops,
2284  body_execution_unit,
2285  group_by_and_aggregate,
2286  query_func,
2287  cgen_state_->row_func_bb_,
2288  *(query_mem_desc.get()),
2289  co,
2290  eo);
2291  } else {
2292  const bool can_return_error = compileBody(
2293  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2294  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2296  createErrorCheckControlFlow(query_func,
2299  co.device_type);
2300  }
2301  }
2302  std::vector<llvm::Value*> hoisted_literals;
2303 
2304  if (co.hoist_literals) {
2305  VLOG(1) << "number of hoisted literals: "
2306  << cgen_state_->query_func_literal_loads_.size()
2307  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2308  << " bytes";
2309  }
2310 
2311  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2312  // we have some hoisted literals...
2313  hoisted_literals = inlineHoistedLiterals();
2314  }
2315 
2316  // replace the row func placeholder call with the call to the actual row func
2317  std::vector<llvm::Value*> row_func_args;
2318  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2319  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2320  }
2321  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2322  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2323  // push hoisted literals arguments, if any
2324  row_func_args.insert(
2325  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2326  llvm::ReplaceInstWithInst(
2327  cgen_state_->row_func_call_,
2328  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2329 
2330  // replace the filter func placeholder call with the call to the actual filter func
2331  if (cgen_state_->filter_func_) {
2332  std::vector<llvm::Value*> filter_func_args;
2333  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2334  arg_it != cgen_state_->filter_func_args_.end();
2335  ++arg_it) {
2336  filter_func_args.push_back(arg_it->first);
2337  }
2338  llvm::ReplaceInstWithInst(
2339  cgen_state_->filter_func_call_,
2340  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2341  }
2342 
2343  // Aggregate
2344  plan_state_->init_agg_vals_ =
2345  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2346 
2347  /*
2348  * If we have decided to use GPU shared memory (decision is not made here), then
2349  * we generate proper code for extra components that it needs (buffer initialization and
2350  * gpu reduction from shared memory to global memory). We then replace these functions
2351  * into the already compiled query_func (replacing two placeholders, write_back_nop and
2352  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
2353  */
2354  if (gpu_smem_context.isSharedMemoryUsed()) {
2355  if (query_mem_desc->getQueryDescriptionType() ==
2357  GpuSharedMemCodeBuilder gpu_smem_code(
2358  cgen_state_->module_,
2359  cgen_state_->context_,
2360  *query_mem_desc,
2361  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
2362  plan_state_->init_agg_vals_);
2363  gpu_smem_code.codegen();
2364  gpu_smem_code.injectFunctionsInto(query_func);
2365 
2366  // helper functions are used for caching purposes later
2367  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2368  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2369  LOG(IR) << gpu_smem_code.toString();
2370  }
2371  }
2372 
2373  auto multifrag_query_func = cgen_state_->module_->getFunction(
2374  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2375  CHECK(multifrag_query_func);
2376 
2377  bind_query(query_func,
2378  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2379  multifrag_query_func,
2380  cgen_state_->module_);
2381 
2382  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2383  if (cgen_state_->filter_func_) {
2384  root_funcs.push_back(cgen_state_->filter_func_);
2385  }
2386  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2387  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2388 
2389  // Always inline the row function and the filter function.
2390  // We don't want register spills in the inner loops.
2391  // LLVM seems to correctly free up alloca instructions
2392  // in these functions even when they are inlined.
2394  if (cgen_state_->filter_func_) {
2396  }
2397 
2398 #ifndef NDEBUG
2399  // Add helpful metadata to the LLVM IR for debugging.
2401 #endif
2402 
2403  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2404  std::string llvm_ir;
2405  if (eo.just_explain) {
2406  if (co.explain_type == ExecutorExplainType::Optimized) {
2407 #ifdef WITH_JIT_DEBUG
2408  throw std::runtime_error(
2409  "Explain optimized not available when JIT runtime debug symbols are enabled");
2410 #else
2411  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2412  // optimized IR after NVVM reflect
2413  llvm::legacy::PassManager pass_manager;
2414  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2415 #endif // WITH_JIT_DEBUG
2416  }
2417  llvm_ir =
2418  serialize_llvm_object(query_func) +
2419  serialize_llvm_object(cgen_state_->row_func_) +
2420  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2421  : "");
2422 
2423 #ifndef NDEBUG
2424  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2425 #endif
2426  }
2427 
2428  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2429  LOG(IR) << "IR for the "
2430  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2431 #ifdef NDEBUG
2432  LOG(IR) << serialize_llvm_object(query_func)
2433  << serialize_llvm_object(cgen_state_->row_func_)
2434  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2435  : "")
2436  << "\nEnd of IR";
2437 #else
2438  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2439 #endif
2440 
2441  // Run some basic validation checks on the LLVM IR before code is generated below.
2442  verify_function_ir(cgen_state_->row_func_);
2443  if (cgen_state_->filter_func_) {
2444  verify_function_ir(cgen_state_->filter_func_);
2445  }
2446 
2447  // Generate final native code from the LLVM IR.
2448  return std::make_tuple(
2449  CompilationResult{
2450  co.device_type == ExecutorDeviceType::CPU
2451  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2452  : optimizeAndCodegenGPU(query_func,
2453  multifrag_query_func,
2454  live_funcs,
2455  is_group_by || ra_exe_unit.estimator,
2456  cuda_mgr,
2457  co),
2458  cgen_state_->getLiterals(),
2459  output_columnar,
2460  llvm_ir,
2461  std::move(gpu_smem_context)},
2462  std::move(query_mem_desc));
2463 }
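The row_func_call_ and filter_func_call_ rewrites above use a standard LLVM pattern: the query template emits a stub call, and once the real function and its full argument list exist, the stub call instruction is swapped for a genuine call with llvm::ReplaceInstWithInst. Below is a minimal, self-contained sketch of that pattern. It is not OmniSciDB code; the names row_process_stub, query_stub_example and row_func_example are invented for illustration, and it only assumes the stock LLVM C++ API.

#include <memory>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>  // llvm::ReplaceInstWithInst

int main() {
  llvm::LLVMContext ctx;
  auto module = std::make_unique<llvm::Module>("sketch", ctx);
  auto* i32 = llvm::Type::getInt32Ty(ctx);

  // Stub the template calls before the real row function exists.
  auto* stub = llvm::Function::Create(llvm::FunctionType::get(i32, false),
                                      llvm::Function::ExternalLinkage,
                                      "row_process_stub",
                                      module.get());

  // Outer "query" function containing the placeholder call.
  auto* query_func = llvm::Function::Create(llvm::FunctionType::get(i32, {i32}, false),
                                            llvm::Function::ExternalLinkage,
                                            "query_stub_example",
                                            module.get());
  llvm::IRBuilder<> ir(llvm::BasicBlock::Create(ctx, "entry", query_func));
  llvm::CallInst* placeholder_call = ir.CreateCall(stub);
  ir.CreateRet(placeholder_call);

  // The real row function, generated later with its final signature.
  auto* row_func = llvm::Function::Create(llvm::FunctionType::get(i32, {i32}, false),
                                          llvm::Function::ExternalLinkage,
                                          "row_func_example",
                                          module.get());

  // Swap the stub call for a call to the real function, forwarding the outer
  // function's argument, mirroring the ReplaceInstWithInst pattern applied to
  // row_func_call_ and filter_func_call_ above.
  llvm::Value* forwarded_arg = &*query_func->arg_begin();
  llvm::ReplaceInstWithInst(placeholder_call,
                            llvm::CallInst::Create(row_func, {forwarded_arg}, ""));

  module->print(llvm::outs(), nullptr);
  return 0;
}

ReplaceInstWithInst rewires every use of the old call to the new one (the ret in the sketch keeps working), which is why the surrounding control flow in query_func needs no further patching.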
+ Here is the call graph for this function:

◆ computeColRangesCache()

AggregatedColRange Executor::computeColRangesCache ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3469 of file Execute.cpp.

References catalog_, CHECK, getLeafColumnRange(), Catalog_Namespace::Catalog::getMetadataForColumn(), getTableInfo(), AggregatedColRange::setColRange(), and ExpressionRange::typeSupportsRange().

Referenced by setupCaching().

3470  {
3471  AggregatedColRange agg_col_range_cache;
3472  CHECK(catalog_);
3473  std::unordered_set<int> phys_table_ids;
3474  for (const auto& phys_input : phys_inputs) {
3475  phys_table_ids.insert(phys_input.table_id);
3476  }
3477  std::vector<InputTableInfo> query_infos;
3478  for (const int table_id : phys_table_ids) {
3479  query_infos.emplace_back(InputTableInfo{table_id, getTableInfo(table_id)});
3480  }
3481  for (const auto& phys_input : phys_inputs) {
3482  const auto cd =
3483  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3484  CHECK(cd);
3485  if (ExpressionRange::typeSupportsRange(cd->columnType)) {
3486  const auto col_var = boost::make_unique<Analyzer::ColumnVar>(
3487  cd->columnType, phys_input.table_id, phys_input.col_id, 0);
3488  const auto col_range = getLeafColumnRange(col_var.get(), query_infos, this, false);
3489  agg_col_range_cache.setColRange(phys_input, col_range);
3490  }
3491  }
3492  return agg_col_range_cache;
3493 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ computeStringDictionaryGenerations()

StringDictionaryGenerations Executor::computeStringDictionaryGenerations ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3495 of file Execute.cpp.

References catalog_, CHECK, Catalog_Namespace::Catalog::getMetadataForColumn(), Catalog_Namespace::Catalog::getMetadataForDict(), kENCODING_DICT, and StringDictionaryGenerations::setGeneration().

Referenced by setupCaching().

3496  {
3497  StringDictionaryGenerations string_dictionary_generations;
3498  CHECK(catalog_);
3499  for (const auto& phys_input : phys_inputs) {
3500  const auto cd =
3501  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3502  CHECK(cd);
3503  const auto& col_ti =
3504  cd->columnType.is_array() ? cd->columnType.get_elem_type() : cd->columnType;
3505  if (col_ti.is_string() && col_ti.get_compression() == kENCODING_DICT) {
3506  const int dict_id = col_ti.get_comp_param();
3507  const auto dd = catalog_->getMetadataForDict(dict_id);
3508  CHECK(dd && dd->stringDict);
3509  string_dictionary_generations.setGeneration(dict_id,
3510  dd->stringDict->storageEntryCount());
3511  }
3512  }
3513  return string_dictionary_generations;
3514 }
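A dictionary's generation, as recorded above, is just its storageEntryCount() at query setup time: string ids created after that point belong to a newer generation and are ignored by this query. The following toy sketch is not OmniSciDB code (ToyStringDictionary and getIdUpToGeneration are invented names); it only illustrates how capping lookups at a recorded generation keeps a query's view of the dictionary stable while strings are appended concurrently.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ToyStringDictionary {
  std::vector<std::string> entries;  // id == index into this vector
  uint64_t storageEntryCount() const { return entries.size(); }
  // Return the id of str if it existed before the given generation, else -1.
  int64_t getIdUpToGeneration(const std::string& str, uint64_t generation) const {
    for (uint64_t id = 0; id < generation && id < entries.size(); ++id) {
      if (entries[id] == str) {
        return static_cast<int64_t>(id);
      }
    }
    return -1;
  }
};

int main() {
  ToyStringDictionary dict{{"foo", "bar"}};
  const uint64_t generation = dict.storageEntryCount();  // snapshot at query start
  dict.entries.push_back("baz");                         // appended mid-query
  std::cout << dict.getIdUpToGeneration("bar", generation) << "\n";  // 1
  std::cout << dict.getIdUpToGeneration("baz", generation) << "\n";  // -1: too new
  return 0;
}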
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ computeTableGenerations()

TableGenerations Executor::computeTableGenerations ( std::unordered_set< int >  phys_table_ids)
private

Definition at line 3516 of file Execute.cpp.

References getTableInfo(), and TableGenerations::setGeneration().

Referenced by setupCaching().

3517  {
3518  TableGenerations table_generations;
3519  for (const int table_id : phys_table_ids) {
3520  const auto table_info = getTableInfo(table_id);
3521  table_generations.setGeneration(
3522  table_id,
3523  TableGeneration{static_cast<int64_t>(table_info.getPhysicalNumTuples()), 0});
3524  }
3525  return table_generations;
3526 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ containsLeftDeepOuterJoin()

bool Executor::containsLeftDeepOuterJoin ( ) const
inline

Definition at line 372 of file Execute.h.

372  {
373  return cgen_state_->contains_left_deep_outer_join_;
374  }

◆ createErrorCheckControlFlow()

void Executor::createErrorCheckControlFlow ( llvm::Function *  query_func,
bool  run_with_dynamic_watchdog,
bool  run_with_allowing_runtime_interrupt,
ExecutorDeviceType  device_type 
)
private

Definition at line 1488 of file NativeCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, ERR_INTERRUPTED, ERR_OUT_OF_TIME, get_arg_by_name(), getExpOfTwo(), GPU, and isPowOfTwo().

1491  {
1492  AUTOMATIC_IR_METADATA(cgen_state_.get());
1493 
1494  // check whether the row processing was successful; currently, it can
1495  // fail by running out of group by buffer slots
1496 
1497  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1498  // when both the dynamic watchdog and runtime interrupt are enabled,
1499  // we use the dynamic watchdog
1500  run_with_allowing_runtime_interrupt = false;
1501  }
1502 
1503  llvm::Value* row_count = nullptr;
1504  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1505  device_type == ExecutorDeviceType::GPU) {
1506  row_count =
1507  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1508  }
1509 
1510  bool done_splitting = false;
1511  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1512  ++bb_it) {
1513  llvm::Value* pos = nullptr;
1514  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1515  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1516  llvm::isa<llvm::PHINode>(*inst_it)) {
1517  if (inst_it->getName() == "pos") {
1518  pos = &*inst_it;
1519  }
1520  continue;
1521  }
1522  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1523  continue;
1524  }
1525  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1526  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1527  auto next_inst_it = inst_it;
1528  ++next_inst_it;
1529  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1530  auto& br_instr = bb_it->back();
1531  llvm::IRBuilder<> ir_builder(&br_instr);
1532  llvm::Value* err_lv = &*inst_it;
1533  if (run_with_dynamic_watchdog) {
1534  CHECK(pos);
1535  llvm::Value* call_watchdog_lv = nullptr;
1536  if (device_type == ExecutorDeviceType::GPU) {
1537  // In order to make sure all threads within a block see the same barrier,
1538  // only those blocks in which none of the threads have experienced the critical
1539  // edge will go through the dynamic watchdog computation
1540  CHECK(row_count);
1541  auto crit_edge_rem =
1542  (blockSize() & (blockSize() - 1))
1543  ? ir_builder.CreateSRem(
1544  row_count,
1545  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1546  : ir_builder.CreateAnd(
1547  row_count,
1548  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1549  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1550  crit_edge_threshold->setName("crit_edge_threshold");
1551 
1552  // only those threads where pos < crit_edge_threshold go through dynamic
1553  // watchdog call
1554  call_watchdog_lv =
1555  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1556  } else {
1557  // CPU path: run watchdog for every 64th row
1558  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1559  call_watchdog_lv = ir_builder.CreateICmp(
1560  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1561  }
1562  CHECK(call_watchdog_lv);
1563  auto error_check_bb = bb_it->splitBasicBlock(
1564  llvm::BasicBlock::iterator(br_instr), ".error_check");
1565  auto& watchdog_br_instr = bb_it->back();
1566 
1567  auto watchdog_check_bb = llvm::BasicBlock::Create(
1568  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1569  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1570  auto detected_timeout = watchdog_ir_builder.CreateCall(
1571  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1572  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1573  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1574  watchdog_ir_builder.CreateBr(error_check_bb);
1575 
1576  llvm::ReplaceInstWithInst(
1577  &watchdog_br_instr,
1578  llvm::BranchInst::Create(
1579  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1580  ir_builder.SetInsertPoint(&br_instr);
1581  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1582 
1583  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1584  unified_err_lv->addIncoming(err_lv, &*bb_it);
1585  err_lv = unified_err_lv;
1586  } else if (run_with_allowing_runtime_interrupt) {
1587  CHECK(pos);
1588  llvm::Value* call_check_interrupt_lv = nullptr;
1589  if (device_type == ExecutorDeviceType::GPU) {
1590  // approximate how many times the %pos variable
1591  // is increased --> the number of iteration
1592  int32_t num_shift_by_gridDim = getExpOfTwo(gridSize());
1593  int32_t num_shift_by_blockDim = getExpOfTwo(blockSize());
1594  if (!isPowOfTwo(gridSize())) {
1595  num_shift_by_gridDim++;
1596  }
1597  if (!isPowOfTwo(blockSize())) {
1598  num_shift_by_blockDim++;
1599  }
1600  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1601  // check the interrupt flag for every 64th iteration
1602  llvm::Value* pos_shifted_per_iteration =
1603  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1604  auto interrupt_predicate =
1605  ir_builder.CreateAnd(pos_shifted_per_iteration, uint64_t(0x3f));
1606  call_check_interrupt_lv =
1607  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1608  interrupt_predicate,
1609  cgen_state_->llInt(int64_t(0LL)));
1610  } else {
1611  // CPU path: run interrupt checker for every 64th row
1612  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1613  call_check_interrupt_lv =
1614  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1615  interrupt_predicate,
1616  cgen_state_->llInt(int64_t(0LL)));
1617  }
1618  CHECK(call_check_interrupt_lv);
1619  auto error_check_bb = bb_it->splitBasicBlock(
1620  llvm::BasicBlock::iterator(br_instr), ".error_check");
1621  auto& check_interrupt_br_instr = bb_it->back();
1622 
1623  auto interrupt_check_bb = llvm::BasicBlock::Create(
1624  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1625  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1626  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1627  cgen_state_->module_->getFunction("check_interrupt"), {});
1628  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1629  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1630  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1631 
1632  llvm::ReplaceInstWithInst(
1633  &check_interrupt_br_instr,
1634  llvm::BranchInst::Create(
1635  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1636  ir_builder.SetInsertPoint(&br_instr);
1637  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1638 
1639  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1640  unified_err_lv->addIncoming(err_lv, &*bb_it);
1641  err_lv = unified_err_lv;
1642  }
1643  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1644  err_lv =
1645  ir_builder.CreateCall(cgen_state_->module_->getFunction("record_error_code"),
1646  std::vector<llvm::Value*>{err_lv, error_code_arg});
1647  if (device_type == ExecutorDeviceType::GPU) {
1648  // let kernel execution finish as expected, regardless of the observed error,
1649  // unless it is from the dynamic watchdog where all threads within that block
1650  // return together.
1651  if (run_with_allowing_runtime_interrupt) {
1652  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1653  err_lv,
1654  cgen_state_->llInt(Executor::ERR_INTERRUPTED));
1655  } else {
1656  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1657  err_lv,
1658  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1659  }
1660 
1661  } else {
1662  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1663  err_lv,
1664  cgen_state_->llInt(static_cast<int32_t>(0)));
1665  }
1666  auto error_bb = llvm::BasicBlock::Create(
1667  cgen_state_->context_, ".error_exit", query_func, new_bb);
1668  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1669  llvm::ReplaceInstWithInst(&br_instr,
1670  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1671  done_splitting = true;
1672  break;
1673  }
1674  }
1675  }
1676  CHECK(done_splitting);
1677 }
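On the GPU path above, each thread's pos advances by gridSize() * blockSize() every iteration of the row loop, so shifting pos right by roughly log2(gridSize()) + log2(blockSize()) recovers the iteration number, and masking with 0x3f fires the interrupt poll about once every 64 iterations. Below is a standalone sketch of that arithmetic under an assumed 64 x 1024 launch configuration; getExpOfTwo and isPowOfTwo are local reimplementations of the MathUtils.h helpers referenced in the listing, not the originals.

#include <cstdint>
#include <iostream>

unsigned getExpOfTwo(unsigned n) {  // floor(log2(n))
  unsigned exp = 0;
  while (n >>= 1) {
    ++exp;
  }
  return exp;
}

bool isPowOfTwo(unsigned n) {
  return n > 0 && (n & (n - 1)) == 0;
}

int main() {
  const unsigned grid_size = 64;     // assumed power-of-two launch configuration
  const unsigned block_size = 1024;  // assumed
  int32_t shift = getExpOfTwo(grid_size) + getExpOfTwo(block_size);
  if (!isPowOfTwo(grid_size)) {
    ++shift;  // round up for non-power-of-two sizes
  }
  if (!isPowOfTwo(block_size)) {
    ++shift;
  }
  // Each thread's pos advances by grid_size * block_size (65536 here) per iteration,
  // so pos >> shift recovers the iteration number; masking with 0x3f makes the
  // interrupt poll fire roughly once every 64 iterations.
  for (uint64_t iteration : {1ULL, 63ULL, 64ULL, 128ULL}) {
    const uint64_t pos = iteration * grid_size * block_size;
    std::cout << "iteration " << iteration << " -> poll? "
              << (((pos >> shift) & 0x3f) == 0) << "\n";  // 0, 0, 1, 1
  }
  return 0;
}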
+ Here is the call graph for this function:

◆ createKernels()

std::vector< std::unique_ptr< ExecutionKernel > > Executor::createKernels ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
ColumnFetcher column_fetcher,
const std::vector< InputTableInfo > &  table_infos,
const ExecutionOptions eo,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const size_t  context_count,
const QueryCompilationDescriptor query_comp_desc,
const QueryMemoryDescriptor query_mem_desc,
RenderInfo render_info,
std::unordered_set< int > &  available_gpus,
int &  available_cpus 
)
private

Determines execution dispatch mode and required fragments for a given query step, then creates kernels to execute the query and returns them for launch.

Definition at line 1939 of file Execute.cpp.

References ExecutionOptions::allow_multifrag, catalog_, CHECK, CHECK_GE, CHECK_GT, anonymous_namespace{Execute.cpp}::checkWorkUnitWatchdog(), deviceCount(), g_inner_join_fragment_skipping, getColLazyFetchInfo(), Catalog_Namespace::Catalog::getDataMgr(), QueryCompilationDescriptor::getDeviceType(), QueryMemoryDescriptor::getEntryCount(), SharedKernelContext::getFragOffsets(), Data_Namespace::DataMgr::getMemoryInfo(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, ExecutionOptions::gpu_input_mem_limit_percent, Data_Namespace::GPU_LEVEL, anonymous_namespace{Execute.cpp}::has_lazy_fetched_columns(), logger::INFO, RelAlgExecutionUnit::input_descs, KernelPerFragment, LOG, MultifragmentKernel, ExecutionOptions::outer_fragment_indices, plan_state_, Projection, RelAlgExecutionUnit::target_exprs, QueryMemoryDescriptor::toString(), RelAlgExecutionUnit::use_bump_allocator, VLOG, and ExecutionOptions::with_watchdog.

Referenced by executeWorkUnitImpl().

1952  {
1953  std::vector<std::unique_ptr<ExecutionKernel>> execution_kernels;
1954 
1955  QueryFragmentDescriptor fragment_descriptor(
1956  ra_exe_unit,
1957  table_infos,
1958  query_comp_desc.getDeviceType() == ExecutorDeviceType::GPU
1959  ? catalog_->getDataMgr().getMemoryInfo(Data_Namespace::MemoryLevel::GPU_LEVEL)
1960  : std::vector<Data_Namespace::MemoryInfo>{},
1961  eo.gpu_input_mem_limit_percent,
1962  eo.outer_fragment_indices);
1963  CHECK(!ra_exe_unit.input_descs.empty());
1964 
1965  const auto device_type = query_comp_desc.getDeviceType();
1966  const bool uses_lazy_fetch =
1967  plan_state_->allow_lazy_fetch_ &&
1968  has_lazy_fetched_columns(getColLazyFetchInfo(ra_exe_unit.target_exprs));
1969  const bool use_multifrag_kernel = (device_type == ExecutorDeviceType::GPU) &&
1970  eo.allow_multifrag && (!uses_lazy_fetch || is_agg);
1971  const auto device_count = deviceCount(device_type);
1972  CHECK_GT(device_count, 0);
1973 
1974  fragment_descriptor.buildFragmentKernelMap(ra_exe_unit,
1975  shared_context.getFragOffsets(),
1976  device_count,
1977  device_type,
1978  use_multifrag_kernel,
1979  g_inner_join_fragment_skipping,
1980  this);
1981  if (eo.with_watchdog && fragment_descriptor.shouldCheckWorkUnitWatchdog()) {
1982  checkWorkUnitWatchdog(ra_exe_unit, table_infos, *catalog_, device_type, device_count);
1983  }
1984 
1985  if (use_multifrag_kernel) {
1986  VLOG(1) << "Creating multifrag execution kernels";
1987  VLOG(1) << query_mem_desc.toString();
1988 
1989  // NB: We should never be on this path when the query is retried because of running
1990  // out of group by slots; also, for scan only queries on CPU we want the
1991  // high-granularity, fragment by fragment execution instead. For scan only queries on
1992  // GPU, we want the multifrag kernel path to save the overhead of allocating an output
1993  // buffer per fragment.
1994  auto multifrag_kernel_dispatch = [&ra_exe_unit,
1995  &execution_kernels,
1996  &column_fetcher,
1997  &eo,
1998  &query_comp_desc,
1999  &query_mem_desc,
2000  render_info](const int device_id,
2001  const FragmentsList& frag_list,
2002  const int64_t rowid_lookup_key) {
2003  execution_kernels.emplace_back(
2004  std::make_unique<ExecutionKernel>(ra_exe_unit,
2005  ExecutorDeviceType::GPU,
2006  device_id,
2007  eo,
2008  column_fetcher,
2009  query_comp_desc,
2010  query_mem_desc,
2011  frag_list,
2012  ExecutorDispatchMode::MultifragmentKernel,
2013  render_info,
2014  rowid_lookup_key));
2015  };
2016  fragment_descriptor.assignFragsToMultiDispatch(multifrag_kernel_dispatch);
2017  } else {
2018  VLOG(1) << "Creating one execution kernel per fragment";
2019  VLOG(1) << query_mem_desc.toString();
2020 
2021  if (!ra_exe_unit.use_bump_allocator && allow_single_frag_table_opt &&
2022  (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) &&
2023  table_infos.size() == 1 && table_infos.front().table_id > 0) {
2024  const auto max_frag_size =
2025  table_infos.front().info.getFragmentNumTuplesUpperBound();
2026  if (max_frag_size < query_mem_desc.getEntryCount()) {
2027  LOG(INFO) << "Lowering scan limit from " << query_mem_desc.getEntryCount()
2028  << " to match max fragment size " << max_frag_size
2029  << " for kernel per fragment execution path.";
2030  throw CompilationRetryNewScanLimit(max_frag_size);
2031  }
2032  }
2033 
2034  size_t frag_list_idx{0};
2035  auto fragment_per_kernel_dispatch = [&ra_exe_unit,
2036  &execution_kernels,
2037  &column_fetcher,
2038  &eo,
2039  &frag_list_idx,
2040  &device_type,
2041  &query_comp_desc,
2042  &query_mem_desc,
2043  render_info](const int device_id,
2044  const FragmentsList& frag_list,
2045  const int64_t rowid_lookup_key) {
2046  if (!frag_list.size()) {
2047  return;
2048  }
2049  CHECK_GE(device_id, 0);
2050 
2051  execution_kernels.emplace_back(
2052  std::make_unique<ExecutionKernel>(ra_exe_unit,
2053  device_type,
2054  device_id,
2055  eo,
2056  column_fetcher,
2057  query_comp_desc,
2058  query_mem_desc,
2059  frag_list,
2060  ExecutorDispatchMode::KernelPerFragment,
2061  render_info,
2062  rowid_lookup_key));
2063  ++frag_list_idx;
2064  };
2065 
2066  fragment_descriptor.assignFragsToKernelDispatch(fragment_per_kernel_dispatch,
2067  ra_exe_unit);
2068  }
2069 
2070  return execution_kernels;
2071 }
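Reduced to its essentials, the dispatch decision above is: fuse fragments into multifragment kernels only on GPU with allow_multifrag set, and only if no lazily fetched columns are involved or the step is an aggregation; otherwise build one kernel per fragment. The sketch below is not OmniSciDB code; chooseDispatchMode and its bool parameters are invented stand-ins for the compilation descriptor, plan state and execution options consulted above.

#include <iostream>

enum class DispatchMode { MultifragmentKernel, KernelPerFragment };

DispatchMode chooseDispatchMode(bool on_gpu,
                                bool allow_multifrag,
                                bool uses_lazy_fetch,
                                bool is_agg) {
  // GPU queries may fuse many fragments into one kernel launch, unless lazily
  // fetched columns force fragment-granular execution for a non-aggregate step.
  const bool use_multifrag_kernel =
      on_gpu && allow_multifrag && (!uses_lazy_fetch || is_agg);
  return use_multifrag_kernel ? DispatchMode::MultifragmentKernel
                              : DispatchMode::KernelPerFragment;
}

int main() {
  // A GPU aggregate with lazy fetch still takes the multifragment path...
  std::cout << (chooseDispatchMode(true, true, true, true) ==
                DispatchMode::MultifragmentKernel)
            << "\n";  // 1
  // ...while a CPU query always gets one kernel per fragment.
  std::cout << (chooseDispatchMode(false, true, false, false) ==
                DispatchMode::KernelPerFragment)
            << "\n";  // 1
  return 0;
}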
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ deviceCount()

int Executor::deviceCount ( const ExecutorDeviceType  device_type) const
private

Definition at line 616 of file Execute.cpp.

References catalog_, CHECK, Data_Namespace::DataMgr::getCudaMgr(), Catalog_Namespace::Catalog::getDataMgr(), and GPU.

Referenced by createKernels(), and deviceCountForMemoryLevel().

616  {
617  if (device_type == ExecutorDeviceType::GPU) {
618  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
619  CHECK(cuda_mgr);
620  return cuda_mgr->getDeviceCount();
621  } else {
622  return 1;
623  }
624 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ deviceCountForMemoryLevel()

int Executor::deviceCountForMemoryLevel ( const Data_Namespace::MemoryLevel  memory_level) const
private

Definition at line 626 of file Execute.cpp.

References CPU, deviceCount(), GPU, and Data_Namespace::GPU_LEVEL.

Referenced by buildHashTableForQualifier().

627  {
628  return memory_level == GPU_LEVEL ? deviceCount(ExecutorDeviceType::GPU)
629  : deviceCount(ExecutorDeviceType::CPU);
630 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ deviceCycles()

int64_t Executor::deviceCycles ( int  milliseconds) const
private

Definition at line 3135 of file Execute.cpp.

References catalog_, CHECK, Data_Namespace::DataMgr::getCudaMgr(), and Catalog_Namespace::Catalog::getDataMgr().

3135  {
3136  CHECK(catalog_);
3137  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
3138  CHECK(cuda_mgr);
3139  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3140  return static_cast<int64_t>(dev_props.front().clockKhz) * milliseconds;
3141 }
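For example, assuming a device that reports clockKhz = 1,410,000 (a 1.41 GHz clock), deviceCycles(100) returns 1,410,000 * 100 = 141,000,000, i.e. roughly the number of clock ticks that device performs in 100 milliseconds.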
+ Here is the call graph for this function:

◆ enableRuntimeQueryInterrupt()

void Executor::enableRuntimeQueryInterrupt ( const unsigned  interrupt_freq) const

Definition at line 3608 of file Execute.cpp.

References g_enable_runtime_query_interrupt, and g_runtime_query_interrupt_frequency.

3608  {
3609  // The only scenario in which we intentionally call this function is
3610  // to allow runtime query interrupt in QueryRunner for test cases.
3611  // Because the test machine's default settings do not allow runtime query interrupt,
3612  // we have to turn it on within test code when necessary.
3613  g_enable_runtime_query_interrupt = true;
3614  g_runtime_query_interrupt_frequency = interrupt_freq;
3615 }

◆ executeExplain()

ResultSetPtr Executor::executeExplain ( const QueryCompilationDescriptor query_comp_desc)
private

Definition at line 1619 of file Execute.cpp.

References QueryCompilationDescriptor::getIR().

Referenced by executeWorkUnitImpl().

1619  {
1620  return std::make_shared<ResultSet>(query_comp_desc.getIR());
1621 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ executePlanWithGroupBy()

int32_t Executor::executePlanWithGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t *>> &  col_buffers,
const std::vector< size_t >  outer_tab_frag_ids,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const int  outer_table_id,
const int64_t  limit,
const uint32_t  start_rowid,
const uint32_t  num_tables,
RenderInfo render_info 
)
private

Definition at line 2848 of file Execute.cpp.

References blockSize(), CHECK, CHECK_NE, anonymous_namespace{Execute.cpp}::check_rows_less_than_needed(), CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_GPU_MEM, ERR_OUT_OF_RENDER_MEM, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY, error_code, logger::FATAL, g_enable_dynamic_watchdog, g_enable_runtime_query_interrupt, CompilationResult::generated_code, QueryMemoryDescriptor::getEntryCount(), getJoinHashTablePtrs(), QueryExecutionContext::getRowSet(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, gridSize(), RelAlgExecutionUnit::groupby_exprs, INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, interrupted_, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, num_rows, shared::printContainer(), QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, RenderInfo::render_allocator_map_ptr, RelAlgExecutionUnit::scan_limit, serializeLiterals(), QueryMemoryDescriptor::setEntryCount(), RelAlgExecutionUnit::union_all, RenderInfo::useCudaBuffers(), and VLOG.

2865  {
2866  auto timer = DEBUG_TIMER(__func__);
2867  INJECT_TIMER(executePlanWithGroupBy);
2868  CHECK(!results);
2869  if (col_buffers.empty()) {
2870  return 0;
2871  }
2872  CHECK_NE(ra_exe_unit.groupby_exprs.size(), size_t(0));
2873  // TODO(alex):
2874  // 1. Optimize size (make keys more compact).
2875  // 2. Resize on overflow.
2876  // 3. Optimize runtime.
2877  auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
2878  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
2879  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
2880  if ((g_enable_dynamic_watchdog || g_enable_runtime_query_interrupt) &&
2881  interrupted_.load()) {
2882  return ERR_INTERRUPTED;
2883  }
2884 
2885  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
2886  if (render_info && render_info->useCudaBuffers()) {
2887  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
2888  }
2889 
2890  VLOG(2) << "bool(ra_exe_unit.union_all)=" << bool(ra_exe_unit.union_all)
2891  << " ra_exe_unit.input_descs="
2892  << shared::printContainer(ra_exe_unit.input_descs)
2893  << " ra_exe_unit.input_col_descs="
2894  << shared::printContainer(ra_exe_unit.input_col_descs)
2895  << " ra_exe_unit.scan_limit=" << ra_exe_unit.scan_limit
2896  << " num_rows=" << shared::printContainer(num_rows)
2897  << " frag_offsets=" << shared::printContainer(frag_offsets)
2898  << " query_exe_context->query_buffers_->num_rows_="
2899  << query_exe_context->query_buffers_->num_rows_
2900  << " query_exe_context->query_mem_desc_.getEntryCount()="
2901  << query_exe_context->query_mem_desc_.getEntryCount()
2902  << " device_id=" << device_id << " outer_table_id=" << outer_table_id
2903  << " scan_limit=" << scan_limit << " start_rowid=" << start_rowid
2904  << " num_tables=" << num_tables;
2905 
2906  RelAlgExecutionUnit ra_exe_unit_copy = ra_exe_unit;
2907  // For UNION ALL, filter out input_descs and input_col_descs that are not associated
2908  // with outer_table_id.
2909  if (ra_exe_unit_copy.union_all) {
2910  // Sort outer_table_id first, then pop the rest off of ra_exe_unit_copy.input_descs.
2911  std::stable_sort(ra_exe_unit_copy.input_descs.begin(),
2912  ra_exe_unit_copy.input_descs.end(),
2913  [outer_table_id](auto const& a, auto const& b) {
2914  return a.getTableId() == outer_table_id &&
2915  b.getTableId() != outer_table_id;
2916  });
2917  while (!ra_exe_unit_copy.input_descs.empty() &&
2918  ra_exe_unit_copy.input_descs.back().getTableId() != outer_table_id) {
2919  ra_exe_unit_copy.input_descs.pop_back();
2920  }
2921  // Filter ra_exe_unit_copy.input_col_descs.
2922  ra_exe_unit_copy.input_col_descs.remove_if(
2923  [outer_table_id](auto const& input_col_desc) {
2924  return input_col_desc->getScanDesc().getTableId() != outer_table_id;
2925  });
2926  query_exe_context->query_mem_desc_.setEntryCount(ra_exe_unit_copy.scan_limit);
2927  }
2928 
2929  if (device_type == ExecutorDeviceType::CPU) {
2930  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
2931  compilation_result.generated_code);
2932  CHECK(cpu_generated_code);
2933  query_exe_context->launchCpuCode(
2934  ra_exe_unit_copy,
2935  cpu_generated_code.get(),
2936  hoist_literals,
2937  hoist_buf,
2938  col_buffers,
2939  num_rows,
2940  frag_offsets,
2941  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
2942  &error_code,
2943  num_tables,
2944  join_hash_table_ptrs);
2945  } else {
2946  try {
2947  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
2948  compilation_result.generated_code);
2949  CHECK(gpu_generated_code);
2950  query_exe_context->launchGpuCode(
2951  ra_exe_unit_copy,
2952  gpu_generated_code.get(),
2953  hoist_literals,
2954  hoist_buf,
2955  col_buffers,
2956  num_rows,
2957  frag_offsets,
2958  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
2959  data_mgr,
2960  blockSize(),
2961  gridSize(),
2962  device_id,
2963  compilation_result.gpu_smem_context.getSharedMemorySize(),
2964  &error_code,
2965  num_tables,
2966  join_hash_table_ptrs,
2967  render_allocator_map_ptr);
2968  } catch (const OutOfMemory&) {
2969  return ERR_OUT_OF_GPU_MEM;
2970  } catch (const OutOfRenderMemory&) {
2971  return ERR_OUT_OF_RENDER_MEM;
2972  } catch (const StreamingTopNNotSupportedInRenderQuery&) {
2974  } catch (const std::exception& e) {
2975  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
2976  }
2977  }
2978 
2979  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
2980  error_code == Executor::ERR_DIV_BY_ZERO ||
2981  error_code == Executor::ERR_OUT_OF_TIME ||
2982  error_code == Executor::ERR_INTERRUPTED ||
2983  error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES ||
2984  error_code == Executor::ERR_GEOS) {
2985  return error_code;
2986  }
2987 
2988  if (error_code != Executor::ERR_OVERFLOW_OR_UNDERFLOW &&
2989  error_code != Executor::ERR_DIV_BY_ZERO && !render_allocator_map_ptr) {
2990  results = query_exe_context->getRowSet(ra_exe_unit_copy,
2991  query_exe_context->query_mem_desc_);
2992  CHECK(results);
2993  VLOG(2) << "results->rowCount()=" << results->rowCount();
2994  results->holdLiterals(hoist_buf);
2995  }
2996  if (error_code < 0 && render_allocator_map_ptr) {
2997  auto const adjusted_scan_limit =
2998  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
2999  // More rows passed the filter than available slots. We don't have a count to check,
3000  // so assume we met the limit if a scan limit is set
3001  if (adjusted_scan_limit != 0) {
3002  return 0;
3003  } else {
3004  return error_code;
3005  }
3006  }
3007  if (error_code && (!scan_limit || check_rows_less_than_needed(results, scan_limit))) {
3008  return error_code; // unlucky, not enough results and we ran out of slots
3009  }
3010 
3011  return 0;
3012 }
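For UNION ALL, the stable_sort / pop_back / remove_if sequence above restricts the execution unit's inputs to the current outer table. The sketch below applies the same sequence to plain table ids; it is not OmniSciDB code, and the ids 3, 5 and 7 are arbitrary illustration values.

#include <algorithm>
#include <iostream>
#include <list>
#include <vector>

int main() {
  const int outer_table_id = 7;
  std::vector<int> input_desc_table_ids{3, 7, 5, 7};
  std::list<int> input_col_desc_table_ids{3, 3, 7, 5, 7};

  // Move descriptors for the outer table to the front, keeping relative order.
  std::stable_sort(input_desc_table_ids.begin(),
                   input_desc_table_ids.end(),
                   [outer_table_id](int a, int b) {
                     return a == outer_table_id && b != outer_table_id;
                   });
  // Drop everything that is not the outer table from the back.
  while (!input_desc_table_ids.empty() &&
         input_desc_table_ids.back() != outer_table_id) {
    input_desc_table_ids.pop_back();
  }
  // Filter the column descriptors the same way.
  input_col_desc_table_ids.remove_if(
      [outer_table_id](int id) { return id != outer_table_id; });

  for (int id : input_desc_table_ids) {
    std::cout << id << ' ';  // 7 7
  }
  std::cout << '\n';
  for (int id : input_col_desc_table_ids) {
    std::cout << id << ' ';  // 7 7
  }
  std::cout << '\n';
  return 0;
}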
+ Here is the call graph for this function:

◆ executePlanWithoutGroupBy()

int32_t Executor::executePlanWithoutGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const std::vector< Analyzer::Expr *> &  target_exprs,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t *>> &  col_buffers,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const uint32_t  start_rowid,
const uint32_t  num_tables,
RenderInfo render_info 
)
private

Definition at line 2648 of file Execute.cpp.

References blockSize(), CHECK, CHECK_EQ, CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_GPU_MEM, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, error_code, RelAlgExecutionUnit::estimator, QueryExecutionContext::estimator_result_set_, logger::FATAL, g_bigint_count, g_enable_dynamic_watchdog, g_enable_runtime_query_interrupt, CompilationResult::generated_code, get_target_info(), QueryExecutionContext::getAggInitValForIndex(), getJoinHashTablePtrs(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, gridSize(), INJECT_TIMER, interrupted_, is_distinct_target(), RenderInfo::isPotentialInSituRender(), GpuSharedMemoryContext::isSharedMemoryUsed(), kAPPROX_COUNT_DISTINCT, kAVG, kCOUNT, kSAMPLE, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, num_rows, out, QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, reduceResults(), RenderInfo::render_allocator_map_ptr, resetInterrupt(), serializeLiterals(), takes_float_argument(), and RenderInfo::useCudaBuffers().

2663  {
2664  INJECT_TIMER(executePlanWithoutGroupBy);
2665  auto timer = DEBUG_TIMER(__func__);
2666  CHECK(!results);
2667  if (col_buffers.empty()) {
2668  return 0;
2669  }
2670 
2671  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
2672  if (render_info) {
2673  // TODO(adb): make sure that we either never get here in the CPU case, or if we do get
2674  // here, we are in non-insitu mode.
2675  CHECK(render_info->useCudaBuffers() || !render_info->isPotentialInSituRender())
2676  << "CUDA disabled rendering in the executePlanWithoutGroupBy query path is "
2677  "currently unsupported.";
2678  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
2679  }
2680 
2681  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
2682  std::vector<int64_t*> out_vec;
2683  const auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
2684  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
2685  std::unique_ptr<OutVecOwner> output_memory_scope;
2686  if ((g_enable_dynamic_watchdog || g_enable_runtime_query_interrupt) &&
2687  interrupted_.load()) {
2688  resetInterrupt();
2689  return ERR_INTERRUPTED;
2690  }
2691  if (device_type == ExecutorDeviceType::CPU) {
2692  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
2693  compilation_result.generated_code);
2694  CHECK(cpu_generated_code);
2695  out_vec = query_exe_context->launchCpuCode(ra_exe_unit,
2696  cpu_generated_code.get(),
2697  hoist_literals,
2698  hoist_buf,
2699  col_buffers,
2700  num_rows,
2701  frag_offsets,
2702  0,
2703  &error_code,
2704  num_tables,
2705  join_hash_table_ptrs);
2706  output_memory_scope.reset(new OutVecOwner(out_vec));
2707  } else {
2708  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
2709  compilation_result.generated_code);
2710  CHECK(gpu_generated_code);
2711  try {
2712  out_vec = query_exe_context->launchGpuCode(
2713  ra_exe_unit,
2714  gpu_generated_code.get(),
2715  hoist_literals,
2716  hoist_buf,
2717  col_buffers,
2718  num_rows,
2719  frag_offsets,
2720  0,
2721  data_mgr,
2722  blockSize(),
2723  gridSize(),
2724  device_id,
2725  compilation_result.gpu_smem_context.getSharedMemorySize(),
2726  &error_code,
2727  num_tables,
2728  join_hash_table_ptrs,
2729  render_allocator_map_ptr);
2730  output_memory_scope.reset(new OutVecOwner(out_vec));
2731  } catch (const OutOfMemory&) {
2732  return ERR_OUT_OF_GPU_MEM;
2733  } catch (const std::exception& e) {
2734  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
2735  }
2736  }
2737  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
2738  error_code == Executor::ERR_DIV_BY_ZERO ||
2739  error_code == Executor::ERR_OUT_OF_TIME ||
2740  error_code == Executor::ERR_INTERRUPTED ||
2741  error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES ||
2742  error_code == Executor::ERR_GEOS) {
2743  return error_code;
2744  }
2745  if (ra_exe_unit.estimator) {
2746  CHECK(!error_code);
2747  results =
2748  std::shared_ptr<ResultSet>(query_exe_context->estimator_result_set_.release());
2749  return 0;
2750  }
2751  std::vector<int64_t> reduced_outs;
2752  const auto num_frags = col_buffers.size();
2753  const size_t entry_count =
2754  device_type == ExecutorDeviceType::GPU
2755  ? (compilation_result.gpu_smem_context.isSharedMemoryUsed()
2756  ? 1
2757  : blockSize() * gridSize() * num_frags)
2758  : num_frags;
2759  if (size_t(1) == entry_count) {
2760  for (auto out : out_vec) {
2761  CHECK(out);
2762  reduced_outs.push_back(*out);
2763  }
2764  } else {
2765  size_t out_vec_idx = 0;
2766 
2767  for (const auto target_expr : target_exprs) {
2768  const auto agg_info = get_target_info(target_expr, g_bigint_count);
2769  CHECK(agg_info.is_agg);
2770 
2771  const int num_iterations = agg_info.sql_type.is_geometry()
2772  ? agg_info.sql_type.get_physical_coord_cols()
2773  : 1;
2774 
2775  for (int i = 0; i < num_iterations; i++) {
2776  int64_t val1;
2777  const bool float_argument_input = takes_float_argument(agg_info);
2778  if (is_distinct_target(agg_info)) {
2779  CHECK(agg_info.agg_kind == kCOUNT ||
2780  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
2781  val1 = out_vec[out_vec_idx][0];
2782  error_code = 0;
2783  } else {
2784  const auto chosen_bytes = static_cast<size_t>(
2785  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx));
2786  std::tie(val1, error_code) = Executor::reduceResults(
2787  agg_info.agg_kind,
2788  agg_info.sql_type,
2789  query_exe_context->getAggInitValForIndex(out_vec_idx),
2790  float_argument_input ? sizeof(int32_t) : chosen_bytes,
2791  out_vec[out_vec_idx],
2792  entry_count,
2793  false,
2794  float_argument_input);
2795  }
2796  if (error_code) {
2797  break;
2798  }
2799  reduced_outs.push_back(val1);
2800  if (agg_info.agg_kind == kAVG ||
2801  (agg_info.agg_kind == kSAMPLE &&
2802  (agg_info.sql_type.is_varlen() || agg_info.sql_type.is_geometry()))) {
2803  const auto chosen_bytes = static_cast<size_t>(
2804  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx +
2805  1));
2806  int64_t val2;
2807  std::tie(val2, error_code) = Executor::reduceResults(
2808  agg_info.agg_kind == kAVG ? kCOUNT : agg_info.agg_kind,
2809  agg_info.sql_type,
2810  query_exe_context->getAggInitValForIndex(out_vec_idx + 1),
2811  float_argument_input ? sizeof(int32_t) : chosen_bytes,
2812  out_vec[out_vec_idx + 1],
2813  entry_count,
2814  false,
2815  false);
2816  if (error_code) {
2817  break;
2818  }
2819  reduced_outs.push_back(val2);
2820  ++out_vec_idx;
2821  }
2822  ++out_vec_idx;
2823  }
2824  }
2825  }
2826 
2827  if (error_code) {
2828  return error_code;
2829  }
2830 
2831  CHECK_EQ(size_t(1), query_exe_context->query_buffers_->result_sets_.size());
2832  auto rows_ptr = std::shared_ptr<ResultSet>(
2833  query_exe_context->query_buffers_->result_sets_[0].release());
2834  rows_ptr->fillOneEntry(reduced_outs);
2835  results = std::move(rows_ptr);
2836  return error_code;
2837 }
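The entry_count computed above is the number of partial aggregate values that reduceResults() must fold per target: one per fragment on CPU, one per GPU thread per fragment otherwise, and a single slot when the shared-memory reduction has already run on the device. The sketch below is not OmniSciDB code; partialEntryCount and the block/grid sizes are assumed values for illustration.

#include <cstddef>
#include <iostream>

size_t partialEntryCount(bool on_gpu,
                         bool gpu_shared_memory_used,
                         size_t block_size,
                         size_t grid_size,
                         size_t num_frags) {
  if (!on_gpu) {
    return num_frags;  // CPU: one partial value per fragment
  }
  // GPU: one partial per thread per fragment, unless the shared-memory code path
  // already reduced each launch down to a single slot.
  return gpu_shared_memory_used ? 1 : block_size * grid_size * num_frags;
}

int main() {
  std::cout << partialEntryCount(false, false, 1024, 64, 3) << "\n";  // 3
  std::cout << partialEntryCount(true, false, 1024, 64, 3) << "\n";   // 196608
  std::cout << partialEntryCount(true, true, 1024, 64, 3) << "\n";    // 1
  return 0;
}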
+ Here is the call graph for this function:

◆ executeTableFunction()

ResultSetPtr Executor::executeTableFunction ( const TableFunctionExecutionUnit  exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const Catalog_Namespace::Catalog &  cat 
)
private

Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps.

Definition at line 1598 of file Execute.cpp.

References TableFunctionCompilationContext::compile(), CompilationOptions::device_type, TableFunctionExecutionContext::execute(), getRowSetMemoryOwner(), INJECT_TIMER, and nukeOldState().

1603  {
1604  INJECT_TIMER(Exec_executeTableFunction);
1605  nukeOldState(false, table_infos, PlanState::DeletedColumnsMap{}, nullptr);
1606 
1607  ColumnCacheMap column_cache; // Note: if we add retries to the table function
1608  // framework, we may want to move this up a level
1609 
1610  ColumnFetcher column_fetcher(this, column_cache);
1611  TableFunctionCompilationContext compilation_context;
1612  compilation_context.compile(exe_unit, co, this);
1613 
1614  TableFunctionExecutionContext exe_context(getRowSetMemoryOwner());
1615  return exe_context.execute(
1616  exe_unit, table_infos, &compilation_context, column_fetcher, co.device_type, this);
1617 }
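
The flow above compiles the execution unit, then hands the compiled context, together with a column fetcher, to an execution context that produces the ResultSet. The following is an illustrative, self-contained sketch of that compile-then-dispatch pattern only; SimpleColumn, SimpleResultSet and compile_table_function are hypothetical stand-ins, not OmniSciDB types.

// Minimal standalone sketch of the compile-then-dispatch pattern described above.
// SimpleColumn, SimpleResultSet and compile_table_function are illustrative
// stand-ins, not OmniSciDB classes.
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

using SimpleColumn = std::vector<int64_t>;     // one input column
using SimpleResultSet = std::vector<int64_t>;  // flattened result rows
using CompiledTableFn = std::function<SimpleResultSet(const SimpleColumn&)>;

// "Compilation": pick (or JIT, in the real engine) a callable for the unit.
CompiledTableFn compile_table_function() {
  return [](const SimpleColumn& in) {
    SimpleResultSet out;
    out.reserve(in.size());
    for (auto v : in) {
      out.push_back(2 * v);  // table function body: one output row per input row
    }
    return out;
  };
}

int main() {
  const SimpleColumn input{1, 2, 3};
  auto fn = compile_table_function();  // compile step
  const auto result = fn(input);       // dispatch/execute step
  for (auto v : result) {
    std::cout << v << '\n';            // 2 4 6 -- consumed by a later step
  }
  return 0;
}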
+ Here is the call graph for this function:

◆ executeUpdate()

void Executor::executeUpdate ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const Catalog_Namespace::Catalog &  cat,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
const UpdateLogForFragment::Callback &  cb,
const bool  is_agg 
)

Definition at line 62 of file ExecuteUpdate.cpp.

References CHECK, CHECK_EQ, CHECK_GT, CPU, FragmentsPerTable::fragment_ids, SharedKernelContext::getFragmentResults(), KernelPerFragment, ExecutionKernel::run(), timer_start(), timer_stop(), and VLOG.

69  {
70  CHECK(cb);
71  VLOG(1) << "Executor " << executor_id_
72  << " is executing update/delete work unit:" << ra_exe_unit_in;
73 
74  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
75  ColumnCacheMap column_cache;
76 
77  ColumnFetcher column_fetcher(this, column_cache);
78  CHECK_GT(ra_exe_unit.input_descs.size(), size_t(0));
79  const auto table_id = ra_exe_unit.input_descs[0].getTableId();
80  const auto& outer_fragments = table_infos.front().info.fragments;
81 
82  std::vector<FragmentsPerTable> fragments = {{0, {0}}};
83  for (size_t tab_idx = 1; tab_idx < ra_exe_unit.input_descs.size(); tab_idx++) {
84  int table_id = ra_exe_unit.input_descs[tab_idx].getTableId();
85  CHECK_EQ(table_infos[tab_idx].table_id, table_id);
86  const auto& fragmentsPerTable = table_infos[tab_idx].info.fragments;
87  FragmentsPerTable entry = {table_id, {}};
88  for (size_t innerFragId = 0; innerFragId < fragmentsPerTable.size(); innerFragId++) {
89  entry.fragment_ids.push_back(innerFragId);
90  }
91  fragments.push_back(entry);
92  }
93 
94  if (outer_fragments.empty()) {
95  return;
96  }
97 
98  const auto max_tuple_count_fragment_it = std::max_element(
99  outer_fragments.begin(), outer_fragments.end(), [](const auto& a, const auto& b) {
100  return a.getNumTuples() < b.getNumTuples();
101  });
102  CHECK(max_tuple_count_fragment_it != outer_fragments.end());
103  int64_t global_max_groups_buffer_entry_guess =
104  max_tuple_count_fragment_it->getNumTuples();
105  if (is_agg) {
106  global_max_groups_buffer_entry_guess = std::min(
107  2 * global_max_groups_buffer_entry_guess, static_cast<int64_t>(100'000'000));
108  }
109 
110  auto query_comp_desc = std::make_unique<QueryCompilationDescriptor>();
111  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
112  {
113  auto clock_begin = timer_start();
114  std::lock_guard<std::mutex> compilation_lock(compilation_mutex_);
115  compilation_queue_time_ms_ += timer_stop(clock_begin);
116 
117  query_mem_desc = query_comp_desc->compile(global_max_groups_buffer_entry_guess,
118  8,
119  /*has_cardinality_estimation=*/true,
120  ra_exe_unit,
121  table_infos,
122  deleted_cols_map,
123  column_fetcher,
124  co,
125  eo,
126  nullptr,
127  this);
128  }
129  CHECK(query_mem_desc);
130 
131  for (size_t fragment_index = 0; fragment_index < outer_fragments.size();
132  ++fragment_index) {
133  const int64_t crt_fragment_tuple_count =
134  outer_fragments[fragment_index].getNumTuples();
135  if (crt_fragment_tuple_count == 0) {
136  // nothing to update
137  continue;
138  }
139 
140  SharedKernelContext shared_context(table_infos);
141  fragments[0] = {table_id, {fragment_index}};
142  {
143  ExecutionKernel current_fragment_kernel(ra_exe_unit,
144  ExecutorDeviceType::CPU,
145  0,
146  eo,
147  column_fetcher,
148  *query_comp_desc,
149  *query_mem_desc,
150  fragments,
151  ExecutorDispatchMode::KernelPerFragment,
152  /*render_info=*/nullptr,
153  /*rowid_lookup_key=*/-1);
154 
155  auto clock_begin = timer_start();
156  std::lock_guard<std::mutex> kernel_lock(kernel_mutex_);
157  kernel_queue_time_ms_ += timer_stop(clock_begin);
158 
159  current_fragment_kernel.run(this, shared_context);
160  }
161  const auto& proj_fragment_results = shared_context.getFragmentResults();
162  if (proj_fragment_results.empty()) {
163  continue;
164  }
165  const auto& proj_fragment_result = proj_fragment_results[0];
166  const auto proj_result_set = proj_fragment_result.first;
167  CHECK(proj_result_set);
168  cb({outer_fragments[fragment_index], fragment_index, proj_result_set});
169  }
170 }
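
The loop above dispatches one kernel per non-empty outer fragment and forwards each fragment's projected result to the supplied callback. A minimal standalone sketch of that per-fragment pattern follows; Fragment, run_kernel_on_fragment and UpdateCallback are hypothetical stand-ins, not the ExecutionKernel/SharedKernelContext API.

// Standalone sketch of the per-fragment dispatch loop above: one kernel per
// non-empty outer fragment, with the projected result handed to a callback.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct Fragment {
  size_t num_tuples;
};

using FragmentResult = std::vector<int64_t>;  // e.g. projected row offsets
using UpdateCallback =
    std::function<void(size_t fragment_index, const FragmentResult&)>;

// Stand-in for running one execution kernel on a single fragment.
FragmentResult run_kernel_on_fragment(const Fragment& frag) {
  FragmentResult result;
  for (size_t i = 0; i < frag.num_tuples; ++i) {
    result.push_back(static_cast<int64_t>(i));
  }
  return result;
}

void execute_update(const std::vector<Fragment>& outer_fragments,
                    const UpdateCallback& cb) {
  for (size_t frag_idx = 0; frag_idx < outer_fragments.size(); ++frag_idx) {
    if (outer_fragments[frag_idx].num_tuples == 0) {
      continue;  // nothing to update in this fragment
    }
    const auto proj_result = run_kernel_on_fragment(outer_fragments[frag_idx]);
    cb(frag_idx, proj_result);  // hand per-fragment results to the caller
  }
}

int main() {
  std::vector<Fragment> fragments{{3}, {0}, {2}};
  execute_update(fragments, [](size_t idx, const FragmentResult& rows) {
    std::cout << "fragment " << idx << ": " << rows.size() << " rows\n";
  });
  return 0;
}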
+ Here is the call graph for this function:

◆ executeWorkUnit()

ResultSetPtr Executor::executeWorkUnit ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit &  ra_exe_unit_in,
const CompilationOptions &  co,
const ExecutionOptions &  options,
const Catalog_Namespace::Catalog &  cat,
RenderInfo *  render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap &  column_cache 
)

Definition at line 1311 of file Execute.cpp.

References cgen_state_, compilation_queue_time_ms_, executeWorkUnitImpl(), executor_id_, kernel_queue_time_ms_, CompilationRetryNewScanLimit::new_scan_limit_, plan_state_, anonymous_namespace{Execute.cpp}::replace_scan_limit(), run_benchmark_import::result, row_set_mem_owner_, and VLOG.

1320  {
1321  VLOG(1) << "Executor " << executor_id_ << " is executing work unit:" << ra_exe_unit_in;
1322 
1323  ScopeGuard cleanup_post_execution = [this] {
1324  // cleanup/unpin GPU buffer allocations
1325  // TODO: separate out this state into a single object
1326  plan_state_.reset(nullptr);
1327  if (cgen_state_) {
1328  cgen_state_->in_values_bitmaps_.clear();
1329  }
1330  };
1331 
1332  try {
1333  auto result = executeWorkUnitImpl(max_groups_buffer_entry_guess,
1334  is_agg,
1335  true,
1336  query_infos,
1337  ra_exe_unit_in,
1338  co,
1339  eo,
1340  cat,
1341  row_set_mem_owner_,
1342  render_info,
1343  has_cardinality_estimation,
1344  column_cache);
1345  if (result) {
1346  result->setKernelQueueTime(kernel_queue_time_ms_);
1347  result->addCompilationQueueTime(compilation_queue_time_ms_);
1348  }
1349  return result;
1350  } catch (const CompilationRetryNewScanLimit& e) {
1351  auto result =
1352  executeWorkUnitImpl(max_groups_buffer_entry_guess,
1353  is_agg,
1354  false,
1355  query_infos,
1356  replace_scan_limit(ra_exe_unit_in, e.new_scan_limit_),
1357  co,
1358  eo,
1359  cat,
1360  row_set_mem_owner_,
1361  render_info,
1362  has_cardinality_estimation,
1363  column_cache);
1364  if (result) {
1365  result->setKernelQueueTime(kernel_queue_time_ms_);
1366  result->addCompilationQueueTime(compilation_queue_time_ms_);
1367  }
1368  return result;
1369  }
1370 }
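
executeWorkUnit wraps executeWorkUnitImpl in a retry: if the first attempt throws CompilationRetryNewScanLimit, the unit is re-run once with the scan limit carried by the exception. A minimal sketch of that retry shape follows; RetryNewScanLimit and run_query are hypothetical stand-ins, not the OmniSciDB classes.

// Standalone sketch of the retry-on-new-scan-limit pattern above.
#include <cstddef>
#include <iostream>
#include <stdexcept>

struct RetryNewScanLimit : std::runtime_error {
  explicit RetryNewScanLimit(size_t limit)
      : std::runtime_error("retry with new scan limit"), new_scan_limit(limit) {}
  size_t new_scan_limit;
};

size_t run_query(size_t scan_limit, bool allow_retry) {
  if (allow_retry && scan_limit < 100) {
    throw RetryNewScanLimit(100);  // first attempt decides the limit was too small
  }
  return scan_limit;  // pretend result: rows scanned
}

size_t run_with_retry(size_t initial_limit) {
  try {
    return run_query(initial_limit, /*allow_retry=*/true);
  } catch (const RetryNewScanLimit& e) {
    // second attempt: use the suggested limit, no further scan-limit retries
    return run_query(e.new_scan_limit, /*allow_retry=*/false);
  }
}

int main() {
  std::cout << run_with_retry(10) << '\n';  // prints 100
  return 0;
}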
+ Here is the call graph for this function:

◆ executeWorkUnitImpl()

ResultSetPtr Executor::executeWorkUnitImpl ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit &  ra_exe_unit_in,
const CompilationOptions &  co,
const ExecutionOptions &  options,
const Catalog_Namespace::Catalog &  cat,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
RenderInfo *  render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap &  column_cache 
)
private

Definition at line 1372 of file Execute.cpp.

References addDeletedColumn(), CompilationOptions::allow_lazy_fetch, ExecutionOptions::allow_runtime_query_interrupt, CHECK, collectAllDeviceResults(), compilation_mutex_, compilation_queue_time_ms_, anonymous_namespace{Execute.cpp}::compute_buffer_entry_guess(), CPU, cpu_threads(), createKernels(), CompilationOptions::device_type, ERR_INTERRUPTED, ERR_OUT_OF_SLOTS, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, executeExplain(), ExecutionOptions::executor_type, CompilationOptions::explain_type, CompilationOptions::filter_on_deleted_column, g_use_tbb_pool, get_available_gpus(), get_context_count(), get_min_byte_width(), getDeviceTypeForTargets(), QueryExecutionError::getErrorCode(), CompilationOptions::hoist_literals, INJECT_TIMER, interrupted_, ExecutionOptions::just_explain, ExecutionOptions::just_validate, MAX_BYTE_WIDTH_SUPPORTED, Native, CompilationOptions::opt_level, plan_state_, Projection, QueryMemoryDescriptor, CompilationOptions::register_intel_jit_listener, resetInterrupt(), resultsUnion(), timer_start(), timer_stop(), VLOG, CompilationOptions::with_dynamic_watchdog, and ExecutionOptions::with_dynamic_watchdog.

Referenced by executeWorkUnit().

1384  {
1385  INJECT_TIMER(Exec_executeWorkUnit);
1386  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
1387  const auto device_type = getDeviceTypeForTargets(ra_exe_unit, co.device_type);
1388  CHECK(!query_infos.empty());
1389  if (!max_groups_buffer_entry_guess) {
1390  // The query has failed the first execution attempt because of running out
1391  // of group by slots. Make the conservative choice: allocate fragment size
1392  // slots and run on the CPU.
1393  CHECK(device_type == ExecutorDeviceType::CPU);
1394  max_groups_buffer_entry_guess = compute_buffer_entry_guess(query_infos);
1395  }
1396 
1397  int8_t crt_min_byte_width{get_min_byte_width()};
1398  do {
1399  SharedKernelContext shared_context(query_infos);
1400  ColumnFetcher column_fetcher(this, column_cache);
1401  auto query_comp_desc_owned = std::make_unique<QueryCompilationDescriptor>();
1402  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc_owned;
1403  if (eo.executor_type == ExecutorType::Native) {
1404  try {
1405  INJECT_TIMER(query_step_compilation);
1406  auto clock_begin = timer_start();
1407  std::lock_guard<std::mutex> compilation_lock(compilation_mutex_);
1408  compilation_queue_time_ms_ += timer_stop(clock_begin);
1409 
1410  query_mem_desc_owned =
1411  query_comp_desc_owned->compile(max_groups_buffer_entry_guess,
1412  crt_min_byte_width,
1413  has_cardinality_estimation,
1414  ra_exe_unit,
1415  query_infos,
1416  deleted_cols_map,
1417  column_fetcher,
1418  {device_type,
1419  co.hoist_literals,
1420  co.opt_level,
1421  co.with_dynamic_watchdog,
1422  co.allow_lazy_fetch,
1423  co.filter_on_deleted_column,
1424  co.explain_type,
1425  co.register_intel_jit_listener},
1426  eo,
1427  render_info,
1428  this);
1429  CHECK(query_mem_desc_owned);
1430  crt_min_byte_width = query_comp_desc_owned->getMinByteWidth();
1431  } catch (CompilationRetryNoCompaction&) {
1432  crt_min_byte_width = MAX_BYTE_WIDTH_SUPPORTED;
1433  continue;
1434  }
1435  } else {
1436  plan_state_.reset(new PlanState(false, query_infos, deleted_cols_map, this));
1437  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
1438  CHECK(!query_mem_desc_owned);
1439  query_mem_desc_owned.reset(
1440  new QueryMemoryDescriptor(this, 0, QueryDescriptionType::Projection, false));
1441  }
1442  if (eo.just_explain) {
1443  return executeExplain(*query_comp_desc_owned);
1444  }
1445 
1446  for (const auto target_expr : ra_exe_unit.target_exprs) {
1447  plan_state_->target_exprs_.push_back(target_expr);
1448  }
1449 
1450  if (!eo.just_validate) {
1451  int available_cpus = cpu_threads();
1452  auto available_gpus = get_available_gpus(cat);
1453 
1454  const auto context_count =
1455  get_context_count(device_type, available_cpus, available_gpus.size());
1456  try {
1457  auto kernels = createKernels(shared_context,
1458  ra_exe_unit,
1459  column_fetcher,
1460  query_infos,
1461  eo,
1462  is_agg,
1463  allow_single_frag_table_opt,
1464  context_count,
1465  *query_comp_desc_owned,
1466  *query_mem_desc_owned,
1467  render_info,
1468  available_gpus,
1469  available_cpus);
1470  if (g_use_tbb_pool) {
1471 #ifdef HAVE_TBB
1472  VLOG(1) << "Using TBB thread pool for kernel dispatch.";
1473  launchKernels<threadpool::TbbThreadPool<void>>(shared_context,
1474  std::move(kernels));
1475 #else
1476  throw std::runtime_error(
1477  "This build is not TBB enabled. Restart the server with "
1478  "\"enable-modern-thread-pool\" disabled.");
1479 #endif
1480  } else {
1481  launchKernels<threadpool::FuturesThreadPool<void>>(shared_context,
1482  std::move(kernels));
1483  }
1484  } catch (QueryExecutionError& e) {
1485  if (eo.with_dynamic_watchdog && interrupted_.load() &&
1486  e.getErrorCode() == ERR_OUT_OF_TIME) {
1487  resetInterrupt();
1488  throw QueryExecutionError(ERR_INTERRUPTED);
1489  }
1490  if (eo.allow_runtime_query_interrupt && interrupted_.load()) {
1491  resetInterrupt();
1492  throw QueryExecutionError(ERR_INTERRUPTED);
1493  }
1494  if (e.getErrorCode() == ERR_OVERFLOW_OR_UNDERFLOW &&
1495  static_cast<size_t>(crt_min_byte_width << 1) <= sizeof(int64_t)) {
1496  crt_min_byte_width <<= 1;
1497  continue;
1498  }
1499  throw;
1500  }
1501  }
1502  if (is_agg) {
1503  try {
1504  return collectAllDeviceResults(shared_context,
1505  ra_exe_unit,
1506  *query_mem_desc_owned,
1507  query_comp_desc_owned->getDeviceType(),
1508  row_set_mem_owner);
1509  } catch (ReductionRanOutOfSlots&) {
1510  throw QueryExecutionError(ERR_OUT_OF_SLOTS);
1511  } catch (OverflowOrUnderflow&) {
1512  crt_min_byte_width <<= 1;
1513  continue;
1514  }
1515  }
1516  return resultsUnion(shared_context, ra_exe_unit);
1517 
1518  } while (static_cast<size_t>(crt_min_byte_width) <= sizeof(int64_t));
1519 
1520  return std::make_shared<ResultSet>(std::vector<TargetInfo>{},
1521  ExecutorDeviceType::CPU,
1522  QueryMemoryDescriptor(),
1523  nullptr,
1524  this);
1525 }
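
The do/while above widens crt_min_byte_width and retries the step when an aggregate overflows its output slots, stopping once the width would exceed sizeof(int64_t). A standalone sketch of that widen-and-retry loop follows; OverflowOrUnderflow and run_step are hypothetical stand-ins for illustration only.

// Standalone sketch of the widen-and-retry loop above: start at a minimum output
// byte width and double it on overflow until it would exceed 8 bytes (int64_t).
#include <cstdint>
#include <iostream>
#include <stdexcept>

struct OverflowOrUnderflow : std::runtime_error {
  OverflowOrUnderflow() : std::runtime_error("overflow or underflow") {}
};

// Pretend aggregate step: succeeds only once the slots are at least 4 bytes wide.
int64_t run_step(int8_t byte_width) {
  if (byte_width < 4) {
    throw OverflowOrUnderflow();
  }
  return 42;
}

int64_t run_with_widening(int8_t min_byte_width) {
  int8_t crt_byte_width = min_byte_width;
  do {
    try {
      return run_step(crt_byte_width);
    } catch (const OverflowOrUnderflow&) {
      crt_byte_width <<= 1;  // widen the per-slot byte width and retry
      continue;
    }
  } while (static_cast<size_t>(crt_byte_width) <= sizeof(int64_t));
  throw std::runtime_error("could not find a workable byte width");
}

int main() {
  std::cout << run_with_widening(1) << '\n';  // retries 1 -> 2 -> 4, then prints 42
  return 0;
}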