OmniSciDB  21ac014ffc
Executor Class Reference

#include <Execute.h>


Classes

struct  ExecutorMutexHolder
 
class  FetchCacheAnchor
 
struct  GroupColLLVMValue
 
struct  JoinHashTableOrError
 

Public Types

using ExecutorId = size_t
 
using CachedCardinality = std::pair< bool, size_t >
 

Public Member Functions

 Executor (const ExecutorId id, Data_Namespace::DataMgr *data_mgr, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
 
const TemporaryTables * getTemporaryTables ()
 
StringDictionaryProxy * getStringDictionaryProxy (const int dict_id, const bool with_generation) const
 
StringDictionaryProxy * getStringDictionaryProxy (const int dictId, const std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const
 
bool isCPUOnly () const
 
bool isArchMaxwell (const ExecutorDeviceType dt) const
 
bool containsLeftDeepOuterJoin () const
 
const ColumnDescriptor * getColumnDescriptor (const Analyzer::ColumnVar *) const
 
const ColumnDescriptor * getPhysicalColumnDescriptor (const Analyzer::ColumnVar *, int) const
 
const Catalog_Namespace::Catalog * getCatalog () const
 
void setCatalog (const Catalog_Namespace::Catalog *catalog)
 
Data_Namespace::DataMgr * getDataMgr () const
 
const std::shared_ptr< RowSetMemoryOwner > getRowSetMemoryOwner () const
 
const TemporaryTables * getTemporaryTables () const
 
Fragmenter_Namespace::TableInfo getTableInfo (const int table_id) const
 
const TableGeneration & getTableGeneration (const int table_id) const
 
ExpressionRange getColRange (const PhysicalInput &) const
 
size_t getNumBytesForFetchedRow (const std::set< int > &table_ids_to_fetch) const
 
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo (const std::vector< Analyzer::Expr * > &target_exprs) const
 
void registerActiveModule (void *module, const int device_id) const
 
void unregisterActiveModule (void *module, const int device_id) const
 
void interrupt (const QuerySessionId &query_session="", const QuerySessionId &interrupt_session="")
 
void resetInterrupt ()
 
void enableRuntimeQueryInterrupt (const double runtime_query_check_freq, const unsigned pending_query_check_freq) const
 
int8_t warpSize () const
 
unsigned gridSize () const
 
unsigned numBlocksPerMP () const
 
unsigned blockSize () const
 
size_t maxGpuSlabSize () const
 
ResultSetPtr executeWorkUnit (size_t &max_groups_buffer_entry_guess, const bool is_agg, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
TableUpdateMetadata executeUpdate (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const TableDescriptor *updated_table_desc, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const UpdateLogForFragment::Callback &cb, const bool is_agg)
 
void addTransientStringLiterals (const RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< RowSetMemoryOwner > &row_set_mem_owner)
 
void setupCaching (const std::unordered_set< PhysicalInput > &phys_inputs, const std::unordered_set< int > &phys_table_ids)
 
void setColRangeCache (const AggregatedColRange &aggregated_col_range)
 
QuerySessionId & getCurrentQuerySession (mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
size_t getRunningExecutorId (mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
void setCurrentQuerySession (const QuerySessionId &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
void setRunningExecutorId (const size_t id, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool checkCurrentQuerySession (const std::string &candidate_query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
void invalidateRunningQuerySession (mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool addToQuerySessionList (const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool removeFromQuerySessionList (const QuerySessionId &query_session, const std::string &submitted_time_str, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
void setQuerySessionAsInterrupted (const QuerySessionId &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
void resetQuerySessionInterruptFlag (const std::string &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool checkIsQuerySessionInterrupted (const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool checkIsRunningQuerySessionInterrupted ()
 
bool checkIsQuerySessionEnrolled (const QuerySessionId &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool updateQuerySessionStatusWithLock (const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus updated_query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool updateQuerySessionExecutorAssignment (const QuerySessionId &query_session, const std::string &submitted_time_str, const size_t executor_id, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
std::vector< QuerySessionStatus > getQuerySessionInfo (const QuerySessionId &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
mapd_shared_mutex & getSessionLock ()
 
CurrentQueryStatus attachExecutorToQuerySession (const QuerySessionId &query_session_id, const std::string &query_str, const std::string &query_submitted_time)
 
void checkPendingQueryStatus (const QuerySessionId &query_session)
 
void clearQuerySessionStatus (const QuerySessionId &query_session, const std::string &submitted_time_str, const bool acquire_spin_lock)
 
void updateQuerySessionStatus (std::shared_ptr< const query_state::QueryState > &query_state, const QuerySessionStatus::QueryStatus new_query_status)
 
void updateQuerySessionStatus (const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus new_query_status)
 
void enrollQuerySession (const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted_time_str, const size_t executor_id, const QuerySessionStatus::QueryStatus query_session_status)
 
void addToCardinalityCache (const std::string &cache_key, const size_t cache_value)
 
CachedCardinality getCachedCardinality (const std::string &cache_key)
 
mapd_shared_mutex & getDataRecyclerLock ()
 
QueryPlanDagCache & getQueryPlanDagCache ()
 
JoinColumnsInfo getJoinColumnsInfo (const Analyzer::Expr *join_expr, JoinColumnSide target_side, bool extract_only_col_id)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 

Static Public Member Functions

static std::shared_ptr< Executor > getExecutor (const ExecutorId id, const std::string &debug_dir="", const std::string &debug_file="", const SystemParameters &system_parameters=SystemParameters())
 
static void nukeCacheOfExecutors ()
 
static void clearMemory (const Data_Namespace::MemoryLevel memory_level)
 
static size_t getArenaBlockSize ()
 
static void addUdfIrToModule (const std::string &udf_ir_filename, const bool is_cuda_ir)
 
static std::pair< int64_t, int32_t > reduceResults (const SQLAgg agg, const SQLTypeInfo &ti, const int64_t agg_init_val, const int8_t out_byte_width, const int64_t *out_vec, const size_t out_vec_sz, const bool is_group_by, const bool float_argument_input)
 
static void addCodeToCache (const CodeCacheKey &, std::shared_ptr< CompilationContext >, llvm::Module *, CodeCache &)
 

Static Public Attributes

static const ExecutorId UNITARY_EXECUTOR_ID = 0
 
static const size_t high_scan_limit
 
static const int32_t ERR_DIV_BY_ZERO {1}
 
static const int32_t ERR_OUT_OF_GPU_MEM {2}
 
static const int32_t ERR_OUT_OF_SLOTS {3}
 
static const int32_t ERR_UNSUPPORTED_SELF_JOIN {4}
 
static const int32_t ERR_OUT_OF_RENDER_MEM {5}
 
static const int32_t ERR_OUT_OF_CPU_MEM {6}
 
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW {7}
 
static const int32_t ERR_OUT_OF_TIME {9}
 
static const int32_t ERR_INTERRUPTED {10}
 
static const int32_t ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED {11}
 
static const int32_t ERR_TOO_MANY_LITERALS {12}
 
static const int32_t ERR_STRING_CONST_IN_RESULTSET {13}
 
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY {14}
 
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES {15}
 
static const int32_t ERR_GEOS {16}
 
static std::mutex compilation_mutex_
 
static std::mutex kernel_mutex_
 

Private Types

using PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>
 

Private Member Functions

void clearMetaInfoCache ()
 
int deviceCount (const ExecutorDeviceType) const
 
int deviceCountForMemoryLevel (const Data_Namespace::MemoryLevel memory_level) const
 
llvm::Value * codegenWindowFunction (const size_t target_index, const CompilationOptions &co)
 
llvm::Value * codegenWindowFunctionAggregate (const CompilationOptions &co)
 
llvm::BasicBlock * codegenWindowResetStateControlFlow ()
 
void codegenWindowFunctionStateInit (llvm::Value *aggregate_state)
 
llvm::Value * codegenWindowFunctionAggregateCalls (llvm::Value *aggregate_state, const CompilationOptions &co)
 
void codegenWindowAvgEpilogue (llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
 
llvm::Value * codegenAggregateWindowState ()
 
llvm::Value * aggregateWindowStatePtr ()
 
CudaMgr_Namespace::CudaMgr * cudaMgr () const
 
bool isArchPascalOrLater (const ExecutorDeviceType dt) const
 
bool needFetchAllFragments (const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
 
bool needLinearizeAllFragments (const ColumnDescriptor *cd, const InputColDescriptor &inner_col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments, const Data_Namespace::MemoryLevel memory_level) const
 
void executeWorkUnitPerFragment (const RelAlgExecutionUnit &ra_exe_unit, const InputTableInfo &table_info, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, PerFragmentCallBack &cb, const std::set< size_t > &fragment_indexes_param)
 Compiles and dispatches a work unit per fragment, processing results with the per-fragment callback. Currently used for computing metrics over fragments (metadata). More...
 
ResultSetPtr executeExplain (const QueryCompilationDescriptor &)
 
ResultSetPtr executeTableFunction (const TableFunctionExecutionUnit exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat)
 Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps. More...
 
ExecutorDeviceType getDeviceTypeForTargets (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType requested_device_type)
 
ResultSetPtr collectAllDeviceResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
 
ResultSetPtr collectAllDeviceShardedTopResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
 
std::unordered_map< int, const Analyzer::BinOper * > getInnerTabIdToJoinCond () const
 
std::vector< std::unique_ptr< ExecutionKernel > > createKernels (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, ColumnFetcher &column_fetcher, const std::vector< InputTableInfo > &table_infos, const ExecutionOptions &eo, const bool is_agg, const bool allow_single_frag_table_opt, const size_t context_count, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, RenderInfo *render_info, std::unordered_set< int > &available_gpus, int &available_cpus)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 
std::vector< size_t > getTableFragmentIndices (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type, const size_t table_idx, const size_t outer_frag_idx, std::map< int, const TableFragments * > &selected_tables_fragments, const std::unordered_map< int, const Analyzer::BinOper * > &inner_table_id_to_join_condition)
 
bool skipFragmentPair (const Fragmenter_Namespace::FragmentInfo &outer_fragment_info, const Fragmenter_Namespace::FragmentInfo &inner_fragment_info, const int inner_table_id, const std::unordered_map< int, const Analyzer::BinOper * > &inner_table_id_to_join_condition, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
 
FetchResult fetchChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments * > &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
 
FetchResult fetchUnionChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments * > &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
 
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags (const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< int, const TableFragments * > &all_tables_fragments)
 
void buildSelectedFragsMapping (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
void buildSelectedFragsMappingForUnion (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< size_t > getFragmentCount (const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
 
int32_t executePlanWithGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info)
 
int32_t executePlanWithoutGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const std::vector< Analyzer::Expr * > &target_exprs, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, QueryExecutionContext *query_exe_context, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *data_mgr, const int device_id, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info)
 
ResultSetPtr resultsUnion (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< int64_t > getJoinHashTablePtrs (const ExecutorDeviceType device_type, const int device_id)
 
ResultSetPtr reduceMultiDeviceResults (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceMultiDeviceResultSets (std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceSpeculativeTopN (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr executeWorkUnitImpl (size_t &max_groups_buffer_entry_guess, const bool is_agg, const bool allow_single_frag_table_opt, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, std::shared_ptr< RowSetMemoryOwner >, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
std::vector< llvm::Value * > inlineHoistedLiterals ()
 
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit (const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
 
llvm::BasicBlock * codegenSkipDeletedOuterTableRow (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::vector< JoinLoop > buildJoinLoops (RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
 
JoinLoop::HoistedFiltersCallback buildHoistLeftHandSideFiltersCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const int inner_table_id, const CompilationOptions &co)
 
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
 
std::shared_ptr< HashJoin > buildCurrentLevelHashTable (const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
 
void redeclareFilterFunction ()
 
llvm::Value * addJoinLoopIterator (const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
 
void codegenJoinLoops (const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
 
bool compileBody (const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
 
void createErrorCheckControlFlow (llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
 
void insertErrorCodeChecker (llvm::Function *query_func, bool hoist_literals, bool allow_runtime_query_interrupt)
 
void preloadFragOffsets (const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
 
JoinHashTableOrError buildHashTableForQualifier (const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, ColumnCacheMap &column_cache, const RegisteredQueryHint &query_hint)
 
void nukeOldState (const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU (llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU (llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
 
std::string generatePTX (const std::string &) const
 
void initializeNVPTXBackend () const
 
int64_t deviceCycles (int milliseconds) const
 
GroupColLLVMValue groupByColumnCodegen (Analyzer::Expr *group_by_col, const size_t col_width, const CompilationOptions &, const bool translate_null_val, const int64_t translated_null_val, DiamondCodegen &, std::stack< llvm::BasicBlock * > &, const bool thread_mem_shared)
 
llvm::Value * castToFP (llvm::Value *, SQLTypeInfo const &from_ti, SQLTypeInfo const &to_ti)
 
llvm::Value * castToIntPtrTyIn (llvm::Value *val, const size_t bit_width)
 
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
bool isFragmentFullyDeleted (const int table_id, const Fragmenter_Namespace::FragmentInfo &fragment)
 
std::pair< bool, int64_t > skipFragment (const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
std::pair< bool, int64_t > skipFragmentInnerJoins (const InputDescriptor &table_desc, const RelAlgExecutionUnit &ra_exe_unit, const Fragmenter_Namespace::FragmentInfo &fragment, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
AggregatedColRange computeColRangesCache (const std::unordered_set< PhysicalInput > &phys_inputs)
 
StringDictionaryGenerations computeStringDictionaryGenerations (const std::unordered_set< PhysicalInput > &phys_inputs)
 
TableGenerations computeTableGenerations (std::unordered_set< int > phys_table_ids)
 
std::shared_ptr< CompilationContext > getCodeFromCache (const CodeCacheKey &, const CodeCache &)
 
std::vector< int8_t > serializeLiterals (const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
 
llvm::Value * spillDoubleElement (llvm::Value *elem_val, llvm::Type *elem_ty)
 
ExecutorMutexHolder acquireExecuteMutex ()
 

Static Private Member Functions

static size_t align (const size_t off_in, const size_t alignment)
 

Private Attributes

std::unique_ptr< CgenState > cgen_state_
 
std::unique_ptr< PlanState > plan_state_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::mutex gpu_exec_mutex_ [max_gpu_count]
 
std::mutex str_dict_mutex_
 
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
 
CodeCache cpu_code_cache_
 
CodeCache gpu_code_cache_
 
const unsigned block_size_x_
 
const unsigned grid_size_x_
 
const size_t max_gpu_slab_size_
 
const std::string debug_dir_
 
const std::string debug_file_
 
const ExecutorId executor_id_
 
const Catalog_Namespace::Catalog * catalog_
 
Data_Namespace::DataMgr * data_mgr_
 
const TemporaryTables * temporary_tables_
 
int64_t kernel_queue_time_ms_ = 0
 
int64_t compilation_queue_time_ms_ = 0
 
std::unique_ptr< WindowProjectNodeContext > window_project_node_context_owned_
 
WindowFunctionContext * active_window_function_ {nullptr}
 
InputTableInfoCache input_table_info_cache_
 
AggregatedColRange agg_col_range_cache_
 
TableGenerations table_generations_
 
const QueryPlanHash INVALID_QUERY_PLAN_HASH {std::hash<std::string>{}(EMPTY_QUERY_PLAN)}
 

Static Private Attributes

static const int max_gpu_count {16}
 
static std::mutex gpu_active_modules_mutex_
 
static uint32_t gpu_active_modules_device_mask_ {0x0}
 
static void * gpu_active_modules_ [max_gpu_count]
 
static std::atomic< bool > interrupted_ {false}
 
static const size_t baseline_threshold
 
static const size_t code_cache_size {1000}
 
static mapd_shared_mutex executor_session_mutex_
 
static QuerySessionId current_query_session_ {""}
 
static size_t running_query_executor_id_ {0}
 
static InterruptFlagMap queries_interrupt_flag_
 
static QuerySessionMap queries_session_map_
 
static std::map< int, std::shared_ptr< Executor > > executors_
 
static std::atomic_flag execute_spin_lock_ = ATOMIC_FLAG_INIT
 
static mapd_shared_mutex execute_mutex_
 
static mapd_shared_mutex executors_cache_mutex_
 
static QueryPlanDagCache query_plan_dag_cache_
 
static mapd_shared_mutex recycler_mutex_
 
static std::unordered_map< std::string, size_t > cardinality_cache_
 

Friends

class BaselineJoinHashTable
 
class CodeGenerator
 
class ColumnFetcher
 
struct DiamondCodegen
 
class ExecutionKernel
 
class HashJoin
 
class OverlapsJoinHashTable
 
class GroupByAndAggregate
 
class QueryCompilationDescriptor
 
class QueryMemoryDescriptor
 
class QueryMemoryInitializer
 
class QueryFragmentDescriptor
 
class QueryExecutionContext
 
class ResultSet
 
class InValuesBitmap
 
class LeafAggregator
 
class PerfectJoinHashTable
 
class QueryRewriter
 
class PendingExecutionClosure
 
class RelAlgExecutor
 
class TableOptimizer
 
class TableFunctionCompilationContext
 
class TableFunctionExecutionContext
 
struct TargetExprCodegenBuilder
 
struct TargetExprCodegen
 
class WindowProjectNodeContext
 

Detailed Description

Definition at line 359 of file Execute.h.

Member Typedef Documentation

using Executor::CachedCardinality = std::pair<bool, size_t>

Definition at line 1008 of file Execute.h.

using Executor::ExecutorId = size_t

Definition at line 366 of file Execute.h.
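For illustration, a minimal sketch of obtaining an executor by its id; the unitary executor id is the usual entry point. This is a hedged example, not taken from the source, and assumes the server's DataMgr and SystemParameters environment is already initialized:

  // Hedged sketch: debug_dir, debug_file and SystemParameters are left at their defaults.
  std::shared_ptr<Executor> executor =
      Executor::getExecutor(Executor::UNITARY_EXECUTOR_ID);
  Executor::ExecutorId id = Executor::UNITARY_EXECUTOR_ID;  // ExecutorId is a plain size_t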

using Executor::PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>

Definition at line 555 of file Execute.h.

Constructor & Destructor Documentation

Executor::Executor ( const ExecutorId  id,
Data_Namespace::DataMgr *  data_mgr,
const size_t  block_size_x,
const size_t  grid_size_x,
const size_t  max_gpu_slab_size,
const std::string &  debug_dir,
const std::string &  debug_file 
)

Definition at line 146 of file Execute.cpp.

153  : cgen_state_(new CgenState({}, false))

Member Function Documentation

ExecutorMutexHolder Executor::acquireExecuteMutex ( )
inlineprivate

Definition at line 1115 of file Execute.h.

References execute_mutex_, executor_id_, Executor::ExecutorMutexHolder::shared_lock, Executor::ExecutorMutexHolder::unique_lock, and UNITARY_EXECUTOR_ID.

1115  {
1116  ExecutorMutexHolder ret;
1117  if (executor_id_ == Executor::UNITARY_EXECUTOR_ID) {
1118  // Only one unitary executor can run at a time
1119  ret.unique_lock = mapd_unique_lock<mapd_shared_mutex>(execute_mutex_);
1120  } else {
1121  ret.shared_lock = mapd_shared_lock<mapd_shared_mutex>(execute_mutex_);
1122  }
1123  return ret;
1124  }
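A hypothetical call site, shown only to illustrate that the returned holder keeps whichever lock flavor was taken alive for the caller's scope:

  // Hedged sketch: the unitary executor serializes on a unique lock, all others share.
  {
    auto execute_mutex_holder = acquireExecuteMutex();
    // ... dispatch and run kernels while the execute mutex is held ...
  }  // the shared or unique lock is released when the holder goes out of scope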
void Executor::addCodeToCache ( const CodeCacheKey &  key,
std::shared_ptr< CompilationContext >  compilation_context,
llvm::Module *  module,
CodeCache &  cache 
)
static

Definition at line 385 of file NativeCodegen.cpp.

References LruCache< key_t, value_t, hash_t >::put().

Referenced by StubGenerator::generateStub().

388  {
389  cache.put(key,
390  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
391  std::move(compilation_context), std::move(module)));
392 }
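A hedged sketch of how the put path pairs with getCodeFromCache(); the key contents and the compile/module helpers below are hypothetical placeholders, not the real key format or API:

  // Hedged sketch: CodeCacheKey contents, compileOnCpu() and currentModule() are made up.
  CodeCacheKey key{"<serialized query function>"};
  std::shared_ptr<CompilationContext> ctx = getCodeFromCache(key, cpu_code_cache_);
  if (!ctx) {
    std::shared_ptr<CompilationContext> new_ctx = compileOnCpu();  // hypothetical helper
    llvm::Module* module = currentModule();                        // hypothetical helper
    Executor::addCodeToCache(key, new_ctx, module, cpu_code_cache_);
  }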

std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > Executor::addDeletedColumn ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 3485 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::add_deleted_col_to_map(), catalog_(), CHECK, CompilationOptions::filter_on_deleted_column, and TABLE.

3487  {
3488  if (!co.filter_on_deleted_column) {
3489  return std::make_tuple(ra_exe_unit, PlanState::DeletedColumnsMap{});
3490  }
3491  auto ra_exe_unit_with_deleted = ra_exe_unit;
3492  PlanState::DeletedColumnsMap deleted_cols_map;
3493  for (const auto& input_table : ra_exe_unit_with_deleted.input_descs) {
3494  if (input_table.getSourceType() != InputSourceType::TABLE) {
3495  continue;
3496  }
3497  const auto td = catalog_->getMetadataForTable(input_table.getTableId());
3498  CHECK(td);
3499  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
3500  if (!deleted_cd) {
3501  continue;
3502  }
3503  CHECK(deleted_cd->columnType.is_boolean());
3504  // check deleted column is not already present
3505  bool found = false;
3506  for (const auto& input_col : ra_exe_unit_with_deleted.input_col_descs) {
3507  if (input_col.get()->getColId() == deleted_cd->columnId &&
3508  input_col.get()->getScanDesc().getTableId() == deleted_cd->tableId &&
3509  input_col.get()->getScanDesc().getNestLevel() == input_table.getNestLevel()) {
3510  found = true;
3511  add_deleted_col_to_map(deleted_cols_map, deleted_cd);
3512  break;
3513  }
3514  }
3515  if (!found) {
3516  // add deleted column
3517  ra_exe_unit_with_deleted.input_col_descs.emplace_back(new InputColDescriptor(
3518  deleted_cd->columnId, deleted_cd->tableId, input_table.getNestLevel()));
3519  add_deleted_col_to_map(deleted_cols_map, deleted_cd);
3520  }
3521  }
3522  return std::make_tuple(ra_exe_unit_with_deleted, deleted_cols_map);
3523 }
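Callers typically unpack the returned tuple with a structured binding; a minimal sketch, assuming ra_exe_unit and co already exist at the call site:

  // Hedged sketch: illustrative call pattern only.
  const auto [ra_exe_unit_with_deleted, deleted_cols_map] = addDeletedColumn(ra_exe_unit, co);
  // The rewritten unit now scans the hidden delete column of each physical input table,
  // and deleted_cols_map lets later codegen stages look that column up by table id.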

llvm::Value * Executor::addJoinLoopIterator ( const std::vector< llvm::Value * > &  prev_iters,
const size_t  level_idx 
)
private

Definition at line 824 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, and CgenState::scan_idx_to_hash_pos_.

825  {
826  AUTOMATIC_IR_METADATA(cgen_state_.get());
827  // Iterators are added for loop-outer joins when the head of the loop is generated,
828  // then once again when the body if generated. Allow this instead of special handling
829  // of call sites.
830  const auto it = cgen_state_->scan_idx_to_hash_pos_.find(level_idx);
831  if (it != cgen_state_->scan_idx_to_hash_pos_.end()) {
832  return it->second;
833  }
834  CHECK(!prev_iters.empty());
835  llvm::Value* matching_row_index = prev_iters.back();
836  const auto it_ok =
837  cgen_state_->scan_idx_to_hash_pos_.emplace(level_idx, matching_row_index);
838  CHECK(it_ok.second);
839  return matching_row_index;
840 }
void Executor::addToCardinalityCache ( const std::string &  cache_key,
const size_t  cache_value 
)

Definition at line 4233 of file Execute.cpp.

References g_use_estimator_result_cache, and VLOG.

4234  {
4235  if (g_use_estimator_result_cache) {
4236  mapd_unique_lock<mapd_shared_mutex> lock(recycler_mutex_);
4237  cardinality_cache_[cache_key] = cache_value;
4238  VLOG(1) << "Put estimated cardinality to the cache";
4239  }
4240 }
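The cache is read back through getCachedCardinality(), which returns the CachedCardinality pair documented above; a hedged usage sketch, where the key string and the computed estimate are made up and executor is a previously obtained Executor instance:

  // Hedged sketch: the key format is illustrative, not the real serialized plan key.
  const std::string cache_key = "<relalg-plan-cache-key>";
  Executor::CachedCardinality cached = executor->getCachedCardinality(cache_key);
  if (cached.first) {
    const size_t estimated_cardinality = cached.second;  // reuse the cached estimate
  } else {
    const size_t fresh_estimate = 1000;  // hypothetical value from an estimator run
    executor->addToCardinalityCache(cache_key, fresh_estimate);
  }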
bool Executor::addToQuerySessionList ( const QuerySessionId &  query_session,
const std::string &  query_str,
const std::string &  submitted,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_status,
mapd_unique_lock< mapd_shared_mutex > &  write_lock 
)

Definition at line 4050 of file Execute.cpp.

4055  {
4056  // an internal API that enrolls the query session into the Executor's session map
4057  if (queries_session_map_.count(query_session)) {
4058  if (queries_session_map_.at(query_session).count(submitted_time_str)) {
4059  queries_session_map_.at(query_session).erase(submitted_time_str);
4060  queries_session_map_.at(query_session)
4061  .emplace(submitted_time_str,
4062  QuerySessionStatus(query_session,
4063  executor_id,
4064  query_str,
4065  submitted_time_str,
4066  query_status));
4067  } else {
4068  queries_session_map_.at(query_session)
4069  .emplace(submitted_time_str,
4070  QuerySessionStatus(query_session,
4071  executor_id,
4072  query_str,
4073  submitted_time_str,
4074  query_status));
4075  }
4076  } else {
4077  std::map<std::string, QuerySessionStatus> executor_per_query_map;
4078  executor_per_query_map.emplace(
4079  submitted_time_str,
4080  QuerySessionStatus(
4081  query_session, executor_id, query_str, submitted_time_str, query_status));
4082  queries_session_map_.emplace(query_session, executor_per_query_map);
4083  }
4084  return queries_interrupt_flag_.emplace(query_session, false).second;
4085 }
void Executor::addTransientStringLiterals ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::shared_ptr< RowSetMemoryOwner > &  row_set_mem_owner 
)

Definition at line 1726 of file Execute.cpp.

References CHECK, RelAlgExecutionUnit::groupby_exprs, kENCODING_DICT, kSAMPLE, kSINGLE_VALUE, RelAlgExecutionUnit::quals, RelAlgExecutionUnit::simple_quals, RelAlgExecutionUnit::target_exprs, and ScalarExprVisitor< T >::visit().

1728  {
1729  TransientDictIdVisitor dict_id_visitor;
1730 
1731  auto visit_expr =
1732  [this, &dict_id_visitor, &row_set_mem_owner](const Analyzer::Expr* expr) {
1733  if (!expr) {
1734  return;
1735  }
1736  const auto dict_id = dict_id_visitor.visit(expr);
1737  if (dict_id >= 0) {
1738  auto sdp = getStringDictionaryProxy(dict_id, row_set_mem_owner, true);
1739  CHECK(sdp);
1740  TransientStringLiteralsVisitor visitor(sdp);
1741  visitor.visit(expr);
1742  }
1743  };
1744 
1745  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
1746  visit_expr(group_expr.get());
1747  }
1748 
1749  for (const auto& group_expr : ra_exe_unit.quals) {
1750  visit_expr(group_expr.get());
1751  }
1752 
1753  for (const auto& group_expr : ra_exe_unit.simple_quals) {
1754  visit_expr(group_expr.get());
1755  }
1756 
1757  for (const auto target_expr : ra_exe_unit.target_exprs) {
1758  const auto& target_type = target_expr->get_type_info();
1759  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
1760  continue;
1761  }
1762  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1763  if (agg_expr) {
1764  if (agg_expr->get_aggtype() == kSINGLE_VALUE ||
1765  agg_expr->get_aggtype() == kSAMPLE) {
1766  visit_expr(agg_expr->get_arg());
1767  }
1768  } else {
1769  visit_expr(target_expr);
1770  }
1771  }
1772 }

void Executor::addUdfIrToModule ( const std::string &  udf_ir_filename,
const bool  is_cuda_ir 
)
static

Definition at line 1771 of file NativeCodegen.cpp.

Referenced by DBHandler::initialize().

1772  {
1773  if (is_cuda_ir) {
1774  read_udf_gpu_module(udf_ir_filename);
1775  } else {
1776  read_udf_cpu_module(udf_ir_filename);
1777  }
1778 }
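A hedged usage sketch, typically driven by server startup options; the file paths below are made up:

  // Hedged sketch: hypothetical UDF IR files registered at startup.
  Executor::addUdfIrToModule("/opt/omnisci/udf/my_udf_cpu.ll", /*is_cuda_ir=*/false);
  Executor::addUdfIrToModule("/opt/omnisci/udf/my_udf_gpu.ll", /*is_cuda_ir=*/true);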

llvm::Value * Executor::aggregateWindowStatePtr ( )
private

Definition at line 124 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), and kFLOAT.

124  {
125  AUTOMATIC_IR_METADATA(cgen_state_.get());
126  const auto window_func_context =
127  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
128  const auto window_func = window_func_context->getWindowFunction();
129  const auto arg_ti = get_adjusted_window_type_info(window_func);
130  llvm::Type* aggregate_state_type =
131  arg_ti.get_type() == kFLOAT
132  ? llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0)
133  : llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
134  const auto aggregate_state_i64 = cgen_state_->llInt(
135  reinterpret_cast<const int64_t>(window_func_context->aggregateState()));
136  return cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_i64,
137  aggregate_state_type);
138 }

static size_t Executor::align ( const size_t  off_in,
const size_t  alignment 
)
inlinestaticprivate

Definition at line 1026 of file Execute.h.

1026  {
1027  size_t off = off_in;
1028  if (off % alignment != 0) {
1029  off += (alignment - off % alignment);
1030  }
1031  return off;
1032  }
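Worked examples of the rounding-up behavior:

  // align() rounds an offset up to the next multiple of `alignment`:
  //   align(13, 8)  == 16   // 13 % 8 == 5, so 8 - 5 == 3 is added
  //   align(16, 8)  == 16   // already aligned, returned unchanged
  //   align(1, 64)  == 64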
CurrentQueryStatus Executor::attachExecutorToQuerySession ( const QuerySessionId &  query_session_id,
const std::string &  query_str,
const std::string &  query_submitted_time 
)

Definition at line 3920 of file Execute.cpp.

References executor_id_().

3923  {
3924  if (!query_session_id.empty()) {
3925  // if session is valid, do update 1) the exact executor id and 2) query status
3926  mapd_unique_lock<mapd_shared_mutex> write_lock(executor_session_mutex_);
3927  updateQuerySessionExecutorAssignment(
3928  query_session_id, query_submitted_time, executor_id_, write_lock);
3929  updateQuerySessionStatusWithLock(query_session_id,
3930  query_submitted_time,
3931  QuerySessionStatus::QueryStatus::PENDING_EXECUTOR,
3932  write_lock);
3933  }
3934  return {query_session_id, query_str};
3935 }
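A hedged sketch of enrolling a session before execution; the session id, query text and timestamp are made up, executor is a previously obtained Executor instance, and the full lifecycle (status updates, clearing) normally lives in RelAlgExecutor:

  // Hedged sketch: illustrative values only.
  const QuerySessionId session_id{"session-42"};
  const std::string query_str{"SELECT COUNT(*) FROM t;"};
  const std::string submitted_at{"2021-06-01 12:00:00"};
  auto current_status =
      executor->attachExecutorToQuerySession(session_id, query_str, submitted_at);
  // ... execute the work unit ...
  executor->clearQuerySessionStatus(session_id, submitted_at, /*acquire_spin_lock=*/false);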

unsigned Executor::blockSize ( ) const

Definition at line 3393 of file Execute.cpp.

References block_size_x_(), CHECK, and data_mgr_().

3393  {
3394  CHECK(data_mgr_);
3395  const auto cuda_mgr = data_mgr_->getCudaMgr();
3396  if (!cuda_mgr) {
3397  return 0;
3398  }
3399  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3400  return block_size_x_ ? block_size_x_ : dev_props.front().maxThreadsPerBlock;
3401 }
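Together with gridSize(), this fixes the CUDA launch shape; a hedged sketch of deriving the total GPU thread count:

  // Hedged sketch: blockSize() returns 0 when no CudaMgr is available (CPU-only setup).
  const unsigned threads_per_block = blockSize();
  const unsigned blocks = gridSize();
  const size_t total_gpu_threads = static_cast<size_t>(threads_per_block) * blocks;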

std::shared_ptr< HashJoin > Executor::buildCurrentLevelHashTable ( const JoinCondition &  current_level_join_conditions,
RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache,
std::vector< std::string > &  fail_reasons 
)
private

Definition at line 669 of file IRCodegen.cpp.

References anonymous_namespace{IRCodegen.cpp}::add_qualifier_to_execution_unit(), ANTI, AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, anonymous_namespace{IRCodegen.cpp}::check_valid_join_qual(), Data_Namespace::CPU_LEVEL, CompilationOptions::device_type, Executor::JoinHashTableOrError::fail_reason, GPU, Data_Namespace::GPU_LEVEL, Executor::JoinHashTableOrError::hash_table, INNER, IS_EQUIVALENCE, PlanState::join_info_, OneToOne, CodeGenerator::plan_state_, JoinCondition::quals, RelAlgExecutionUnit::query_hint, SEMI, and JoinCondition::type.

675  {
676  AUTOMATIC_IR_METADATA(cgen_state_.get());
677  if (current_level_join_conditions.type != JoinType::INNER &&
678  current_level_join_conditions.type != JoinType::SEMI &&
679  current_level_join_conditions.type != JoinType::ANTI &&
680  current_level_join_conditions.quals.size() > 1) {
681  fail_reasons.emplace_back("No equijoin expression found for outer join");
682  return nullptr;
683  }
684  std::shared_ptr<HashJoin> current_level_hash_table;
685  for (const auto& join_qual : current_level_join_conditions.quals) {
686  auto qual_bin_oper = std::dynamic_pointer_cast<Analyzer::BinOper>(join_qual);
687  if (!qual_bin_oper || !IS_EQUIVALENCE(qual_bin_oper->get_optype())) {
688  fail_reasons.emplace_back("No equijoin expression found");
689  if (current_level_join_conditions.type == JoinType::INNER ||
690  current_level_join_conditions.type == JoinType::SEMI ||
691  current_level_join_conditions.type == JoinType::ANTI) {
692  add_qualifier_to_execution_unit(ra_exe_unit, join_qual);
693  }
694  continue;
695  }
696  check_valid_join_qual(qual_bin_oper);
697  JoinHashTableOrError hash_table_or_error;
698  if (!current_level_hash_table) {
699  hash_table_or_error = buildHashTableForQualifier(
700  qual_bin_oper,
701  query_infos,
702  co.device_type == ExecutorDeviceType::GPU ? Data_Namespace::GPU_LEVEL
703  : Data_Namespace::CPU_LEVEL,
704  current_level_join_conditions.type,
705  HashType::OneToOne,
706  column_cache,
707  ra_exe_unit.query_hint);
708  current_level_hash_table = hash_table_or_error.hash_table;
709  }
710  if (hash_table_or_error.hash_table) {
711  plan_state_->join_info_.join_hash_tables_.push_back(hash_table_or_error.hash_table);
712  plan_state_->join_info_.equi_join_tautologies_.push_back(qual_bin_oper);
713  } else {
714  fail_reasons.push_back(hash_table_or_error.fail_reason);
715  if (current_level_join_conditions.type == JoinType::INNER ||
716  current_level_join_conditions.type == JoinType::SEMI ||
717  current_level_join_conditions.type == JoinType::ANTI) {
718  add_qualifier_to_execution_unit(ra_exe_unit, qual_bin_oper);
719  }
720  }
721  }
722  return current_level_hash_table;
723 }

Executor::JoinHashTableOrError Executor::buildHashTableForQualifier ( const std::shared_ptr< Analyzer::BinOper > &  qual_bin_oper,
const std::vector< InputTableInfo > &  query_infos,
const MemoryLevel  memory_level,
const JoinType  join_type,
const HashType  preferred_hash_type,
ColumnCacheMap &  column_cache,
const RegisteredQueryHint &  query_hint 
)
private

Definition at line 3341 of file Execute.cpp.

References g_enable_dynamic_watchdog, g_enable_overlaps_hashjoin, and HashJoin::getInstance().

3348  {
3349  if (!g_enable_overlaps_hashjoin && qual_bin_oper->is_overlaps_oper()) {
3350  return {nullptr, "Overlaps hash join disabled, attempting to fall back to loop join"};
3351  }
3352  if (g_enable_dynamic_watchdog && interrupted_.load()) {
3353  throw QueryExecutionError(ERR_INTERRUPTED);
3354  }
3355  try {
3356  auto tbl = HashJoin::getInstance(qual_bin_oper,
3357  query_infos,
3358  memory_level,
3359  join_type,
3360  preferred_hash_type,
3361  deviceCountForMemoryLevel(memory_level),
3362  column_cache,
3363  this,
3364  query_hint);
3365  return {tbl, ""};
3366  } catch (const HashJoinFail& e) {
3367  return {nullptr, e.what()};
3368  }
3369 }

JoinLoop::HoistedFiltersCallback Executor::buildHoistLeftHandSideFiltersCb ( const RelAlgExecutionUnit &  ra_exe_unit,
const size_t  level_idx,
const int  inner_table_id,
const CompilationOptions &  co 
)
private

Definition at line 502 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CodeGenerator::codegen(), g_enable_left_join_filter_hoisting, PlanState::hoisted_filters_, RelAlgExecutionUnit::join_quals, LEFT, CgenState::llBool(), CodeGenerator::plan_state_, RelAlgExecutionUnit::quals, RelAlgExecutionUnit::simple_quals, CodeGenerator::toBool(), and VLOG.

506  {
507  if (!g_enable_left_join_filter_hoisting) {
508  return nullptr;
509  }
510 
511  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
512  if (level_idx == 0 && current_level_join_conditions.type == JoinType::LEFT) {
513  const auto& condition = current_level_join_conditions.quals.front();
514  const auto bin_oper = dynamic_cast<const Analyzer::BinOper*>(condition.get());
515  CHECK(bin_oper) << condition->toString();
516  const auto rhs =
517  dynamic_cast<const Analyzer::ColumnVar*>(bin_oper->get_right_operand());
518  const auto lhs =
519  dynamic_cast<const Analyzer::ColumnVar*>(bin_oper->get_left_operand());
520  if (lhs && rhs && lhs->get_table_id() != rhs->get_table_id()) {
521  const Analyzer::ColumnVar* selected_lhs{nullptr};
522  // grab the left hand side column -- this is somewhat similar to normalize column
523  // pair, and a better solution may be to hoist that function out of the join
524  // framework and normalize columns at the top of build join loops
525  if (lhs->get_table_id() == inner_table_id) {
526  selected_lhs = rhs;
527  } else if (rhs->get_table_id() == inner_table_id) {
528  selected_lhs = lhs;
529  }
530  if (selected_lhs) {
531  std::list<std::shared_ptr<Analyzer::Expr>> hoisted_quals;
532  // get all LHS-only filters
533  auto should_hoist_qual = [&hoisted_quals](const auto& qual, const int table_id) {
534  CHECK(qual);
535 
536  ExprTableIdVisitor visitor;
537  const auto table_ids = visitor.visit(qual.get());
538  if (table_ids.size() == 1 && table_ids.find(table_id) != table_ids.end()) {
539  hoisted_quals.push_back(qual);
540  }
541  };
542  for (const auto& qual : ra_exe_unit.simple_quals) {
543  should_hoist_qual(qual, selected_lhs->get_table_id());
544  }
545  for (const auto& qual : ra_exe_unit.quals) {
546  should_hoist_qual(qual, selected_lhs->get_table_id());
547  }
548 
549  // build the filters callback and return it
550  if (!hoisted_quals.empty()) {
551  return [this, hoisted_quals, co](llvm::BasicBlock* true_bb,
552  llvm::BasicBlock* exit_bb,
553  const std::string& loop_name,
554  llvm::Function* parent_func,
555  CgenState* cgen_state) -> llvm::BasicBlock* {
556  // make sure we have quals to hoist
557  bool has_quals_to_hoist = false;
558  for (const auto& qual : hoisted_quals) {
559  // check to see if the filter was previously hoisted. if all filters were
560  // previously hoisted, this callback becomes a noop
561  if (plan_state_->hoisted_filters_.count(qual) == 0) {
562  has_quals_to_hoist = true;
563  break;
564  }
565  }
566 
567  if (!has_quals_to_hoist) {
568  return nullptr;
569  }
570 
571  AUTOMATIC_IR_METADATA(cgen_state);
572 
573  llvm::IRBuilder<>& builder = cgen_state->ir_builder_;
574  auto& context = builder.getContext();
575 
576  const auto filter_bb =
577  llvm::BasicBlock::Create(context,
578  "hoisted_left_join_filters_" + loop_name,
579  parent_func,
580  /*insert_before=*/true_bb);
581  builder.SetInsertPoint(filter_bb);
582 
583  llvm::Value* filter_lv = cgen_state_->llBool(true);
584  CodeGenerator code_generator(this);
586  for (const auto& qual : hoisted_quals) {
587  if (plan_state_->hoisted_filters_.insert(qual).second) {
588  // qual was inserted into the hoisted filters map, which means we have not
589  // seen this qual before. Generate filter.
590  VLOG(1) << "Generating code for hoisted left hand side qualifier "
591  << qual->toString();
592  auto cond = code_generator.toBool(
593  code_generator.codegen(qual.get(), true, co).front());
594  filter_lv = builder.CreateAnd(filter_lv, cond);
595  }
596  }
597  CHECK(filter_lv->getType()->isIntegerTy(1));
598 
599  builder.CreateCondBr(filter_lv, true_bb, exit_bb);
600  return filter_bb;
601  };
602  }
603  }
604  }
605  }
606  return nullptr;
607 }

std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> Executor::buildIsDeletedCb ( const RelAlgExecutionUnit &  ra_exe_unit,
const size_t  level_idx,
const CompilationOptions &  co 
)
private

Definition at line 610 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::filter_on_deleted_column, PlanState::getDeletedColForTable(), RelAlgExecutionUnit::input_descs, CgenState::ir_builder_, CgenState::llBool(), CgenState::llInt(), CodeGenerator::plan_state_, TABLE, and CodeGenerator::toBool().

612  {
613  AUTOMATIC_IR_METADATA(cgen_state_.get());
614  if (!co.filter_on_deleted_column) {
615  return nullptr;
616  }
617  CHECK_LT(level_idx + 1, ra_exe_unit.input_descs.size());
618  const auto input_desc = ra_exe_unit.input_descs[level_idx + 1];
619  if (input_desc.getSourceType() != InputSourceType::TABLE) {
620  return nullptr;
621  }
622 
623  const auto deleted_cd = plan_state_->getDeletedColForTable(input_desc.getTableId());
624  if (!deleted_cd) {
625  return nullptr;
626  }
627  CHECK(deleted_cd->columnType.is_boolean());
628  const auto deleted_expr = makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
629  input_desc.getTableId(),
630  deleted_cd->columnId,
631  input_desc.getNestLevel());
632  return [this, deleted_expr, level_idx, &co](const std::vector<llvm::Value*>& prev_iters,
633  llvm::Value* have_more_inner_rows) {
634  const auto matching_row_index = addJoinLoopIterator(prev_iters, level_idx + 1);
635  // Avoid fetching the deleted column from a position which is not valid.
636  // An invalid position can be returned by a one to one hash lookup (negative)
637  // or at the end of iteration over a set of matching values.
638  llvm::Value* is_valid_it{nullptr};
639  if (have_more_inner_rows) {
640  is_valid_it = have_more_inner_rows;
641  } else {
642  is_valid_it = cgen_state_->ir_builder_.CreateICmp(
643  llvm::ICmpInst::ICMP_SGE, matching_row_index, cgen_state_->llInt<int64_t>(0));
644  }
645  const auto it_valid_bb = llvm::BasicBlock::Create(
646  cgen_state_->context_, "it_valid", cgen_state_->current_func_);
647  const auto it_not_valid_bb = llvm::BasicBlock::Create(
648  cgen_state_->context_, "it_not_valid", cgen_state_->current_func_);
649  cgen_state_->ir_builder_.CreateCondBr(is_valid_it, it_valid_bb, it_not_valid_bb);
650  const auto row_is_deleted_bb = llvm::BasicBlock::Create(
651  cgen_state_->context_, "row_is_deleted", cgen_state_->current_func_);
652  cgen_state_->ir_builder_.SetInsertPoint(it_valid_bb);
653  CodeGenerator code_generator(this);
654  const auto row_is_deleted = code_generator.toBool(
655  code_generator.codegen(deleted_expr.get(), true, co).front());
656  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
657  cgen_state_->ir_builder_.SetInsertPoint(it_not_valid_bb);
658  const auto row_is_deleted_default = cgen_state_->llBool(false);
659  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
660  cgen_state_->ir_builder_.SetInsertPoint(row_is_deleted_bb);
661  auto row_is_deleted_or_default =
662  cgen_state_->ir_builder_.CreatePHI(row_is_deleted->getType(), 2);
663  row_is_deleted_or_default->addIncoming(row_is_deleted, it_valid_bb);
664  row_is_deleted_or_default->addIncoming(row_is_deleted_default, it_not_valid_bb);
665  return row_is_deleted_or_default;
666  };
667 }

std::vector< JoinLoop > Executor::buildJoinLoops ( RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache 
)
private

Definition at line 289 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), INJECT_TIMER, CgenState::ir_builder_, RelAlgExecutionUnit::join_quals, LEFT, CgenState::llBool(), OneToOne, CgenState::outer_join_match_found_per_level_, Set, Singleton, JoinLoopDomain::slot_lookup_result, CodeGenerator::toBool(), JoinCondition::type, and JoinLoopDomain::values_buffer.

294  {
295  AUTOMATIC_IR_METADATA(cgen_state_.get());
296  INJECT_TIMER(buildJoinLoops);
297  std::vector<JoinLoop> join_loops;
298  for (size_t level_idx = 0, current_hash_table_idx = 0;
299  level_idx < ra_exe_unit.join_quals.size();
300  ++level_idx) {
301  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
302  std::vector<std::string> fail_reasons;
303  const auto build_cur_level_hash_table = [&]() {
304  if (current_level_join_conditions.quals.size() > 1) {
305  const auto first_qual = *current_level_join_conditions.quals.begin();
306  auto qual_bin_oper =
307  std::dynamic_pointer_cast<const Analyzer::BinOper>(first_qual);
308  if (qual_bin_oper && qual_bin_oper->is_overlaps_oper() &&
309  current_level_join_conditions.type == JoinType::LEFT) {
310  JoinCondition join_condition{{first_qual}, current_level_join_conditions.type};
311 
312  return buildCurrentLevelHashTable(
313  join_condition, ra_exe_unit, co, query_infos, column_cache, fail_reasons);
314  }
315  }
316  return buildCurrentLevelHashTable(current_level_join_conditions,
317  ra_exe_unit,
318  co,
319  query_infos,
320  column_cache,
321  fail_reasons);
322  };
323  const auto current_level_hash_table = build_cur_level_hash_table();
324  const auto found_outer_join_matches_cb =
325  [this, level_idx](llvm::Value* found_outer_join_matches) {
326  CHECK_LT(level_idx, cgen_state_->outer_join_match_found_per_level_.size());
327  CHECK(!cgen_state_->outer_join_match_found_per_level_[level_idx]);
328  cgen_state_->outer_join_match_found_per_level_[level_idx] =
329  found_outer_join_matches;
330  };
331  const auto is_deleted_cb = buildIsDeletedCb(ra_exe_unit, level_idx, co);
332  const auto outer_join_condition_multi_quals_cb =
333  [this, level_idx, &co, &current_level_join_conditions](
334  const std::vector<llvm::Value*>& prev_iters) {
335  // The values generated for the match path don't dominate all uses
336  // since on the non-match path nulls are generated. Reset the cache
337  // once the condition is generated to avoid incorrect reuse.
338  FetchCacheAnchor anchor(cgen_state_.get());
339  addJoinLoopIterator(prev_iters, level_idx + 1);
340  llvm::Value* left_join_cond = cgen_state_->llBool(true);
341  CodeGenerator code_generator(this);
342  // Only evaluate quals 1..N here; the first qual is already handled by the hash table.
343  // Note(jclay): this may need to support more than two quals per level.
344  // Are there any such cases?
345  if (current_level_join_conditions.quals.size() >= 2) {
346  auto qual_it = std::next(current_level_join_conditions.quals.begin(), 1);
347  for (; qual_it != current_level_join_conditions.quals.end();
348  std::advance(qual_it, 1)) {
349  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
350  left_join_cond,
351  code_generator.toBool(
352  code_generator.codegen(qual_it->get(), true, co).front()));
353  }
354  }
355  return left_join_cond;
356  };
357  if (current_level_hash_table) {
358  const auto hoisted_filters_cb = buildHoistLeftHandSideFiltersCb(
359  ra_exe_unit, level_idx, current_level_hash_table->getInnerTableId(), co);
360  if (current_level_hash_table->getHashType() == HashType::OneToOne) {
361  join_loops.emplace_back(
362  /*kind=*/JoinLoopKind::Singleton,
363  /*type=*/current_level_join_conditions.type,
364  /*iteration_domain_codegen=*/
365  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
366  const std::vector<llvm::Value*>& prev_iters) {
367  addJoinLoopIterator(prev_iters, level_idx);
368  JoinLoopDomain domain{{0}};
369  domain.slot_lookup_result =
370  current_level_hash_table->codegenSlot(co, current_hash_table_idx);
371  return domain;
372  },
373  /*outer_condition_match=*/nullptr,
374  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
375  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
376  : nullptr,
377  /*hoisted_filters=*/hoisted_filters_cb,
378  /*is_deleted=*/is_deleted_cb);
379  } else {
380  join_loops.emplace_back(
381  /*kind=*/JoinLoopKind::Set,
382  /*type=*/current_level_join_conditions.type,
383  /*iteration_domain_codegen=*/
384  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
385  const std::vector<llvm::Value*>& prev_iters) {
386  addJoinLoopIterator(prev_iters, level_idx);
387  JoinLoopDomain domain{{0}};
388  const auto matching_set = current_level_hash_table->codegenMatchingSet(
389  co, current_hash_table_idx);
390  domain.values_buffer = matching_set.elements;
391  domain.element_count = matching_set.count;
392  return domain;
393  },
394  /*outer_condition_match=*/
395  current_level_join_conditions.type == JoinType::LEFT
396  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
397  outer_join_condition_multi_quals_cb)
398  : nullptr,
399  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
400  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
401  : nullptr,
402  /*hoisted_filters=*/hoisted_filters_cb,
403  /*is_deleted=*/is_deleted_cb);
404  }
405  ++current_hash_table_idx;
406  } else {
407  const auto fail_reasons_str = current_level_join_conditions.quals.empty()
408  ? "No equijoin expression found"
409  : boost::algorithm::join(fail_reasons, " | ");
410  check_if_loop_join_is_allowed(
411  ra_exe_unit, eo, query_infos, level_idx, fail_reasons_str);
412  // Callback provided to the `JoinLoop` framework to evaluate the (outer) join
413  // condition.
414  VLOG(1) << "Unable to build hash table, falling back to loop join: "
415  << fail_reasons_str;
416  const auto outer_join_condition_cb =
417  [this, level_idx, &co, &current_level_join_conditions](
418  const std::vector<llvm::Value*>& prev_iters) {
419  // The values generated for the match path don't dominate all uses
420  // since on the non-match path nulls are generated. Reset the cache
421  // once the condition is generated to avoid incorrect reuse.
422  FetchCacheAnchor anchor(cgen_state_.get());
423  addJoinLoopIterator(prev_iters, level_idx + 1);
424  llvm::Value* left_join_cond = cgen_state_->llBool(true);
425  CodeGenerator code_generator(this);
426  for (auto expr : current_level_join_conditions.quals) {
427  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
428  left_join_cond,
429  code_generator.toBool(
430  code_generator.codegen(expr.get(), true, co).front()));
431  }
432  return left_join_cond;
433  };
434  join_loops.emplace_back(
435  /*kind=*/JoinLoopKind::UpperBound,
436  /*type=*/current_level_join_conditions.type,
437  /*iteration_domain_codegen=*/
438  [this, level_idx](const std::vector<llvm::Value*>& prev_iters) {
439  addJoinLoopIterator(prev_iters, level_idx);
440  JoinLoopDomain domain{{0}};
441  const auto rows_per_scan_ptr = cgen_state_->ir_builder_.CreateGEP(
442  get_arg_by_name(cgen_state_->row_func_, "num_rows_per_scan"),
443  cgen_state_->llInt(int32_t(level_idx + 1)));
444  domain.upper_bound = cgen_state_->ir_builder_.CreateLoad(rows_per_scan_ptr,
445  "num_rows_per_scan");
446  return domain;
447  },
448  /*outer_condition_match=*/
449  current_level_join_conditions.type == JoinType::LEFT
450  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
451  outer_join_condition_cb)
452  : nullptr,
453  /*found_outer_matches=*/
454  current_level_join_conditions.type == JoinType::LEFT
455  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
456  : nullptr,
457  /*hoisted_filters=*/nullptr,
458  /*is_deleted=*/is_deleted_cb);
459  }
460  }
461  return join_loops;
462 }
std::shared_ptr< HashJoin > buildCurrentLevelHashTable(const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
Definition: IRCodegen.cpp:669
llvm::Value * values_buffer
Definition: JoinLoop.h:48
std::string join(T const &container, std::string const &delim)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:166
#define INJECT_TIMER(DESC)
Definition: measure.h:93
const JoinQualsPerNestingLevel join_quals
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * slot_lookup_result
Definition: JoinLoop.h:46
#define CHECK_LT(x, y)
Definition: Logger.h:216
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:824
#define CHECK(condition)
Definition: Logger.h:206
void check_if_loop_join_is_allowed(RelAlgExecutionUnit &ra_exe_unit, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, const size_t level_idx, const std::string &fail_reason)
Definition: IRCodegen.cpp:250
JoinLoop::HoistedFiltersCallback buildHoistLeftHandSideFiltersCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const int inner_table_id, const CompilationOptions &co)
Definition: IRCodegen.cpp:502
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:289
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
Definition: IRCodegen.cpp:610
#define VLOG(n)
Definition: Logger.h:300

+ Here is the call graph for this function:
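Each nesting level is lowered to one JoinLoop whose kind depends on whether a hash table could be built for that level and on its layout. A minimal sketch of that decision, using a hypothetical enum that mirrors JoinLoopKind (Singleton, Set, UpperBound):

enum class SketchLoopKind { Singleton, Set, UpperBound };

// hash_table_built: buildCurrentLevelHashTable() succeeded for this level
// one_to_one: the resulting hash table has a HashType::OneToOne layout
SketchLoopKind pick_loop_kind(bool hash_table_built, bool one_to_one) {
  if (!hash_table_built) {
    return SketchLoopKind::UpperBound;  // fall back to a loop join over all inner rows
  }
  return one_to_one ? SketchLoopKind::Singleton  // single slot lookup per outer row
                    : SketchLoopKind::Set;       // iterate the set of matching rows
}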

void Executor::buildSelectedFragsMapping ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList selected_fragments,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 2806 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LT, and RelAlgExecutionUnit::input_descs.

2811  {
2812  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2813  size_t frag_pos{0};
2814  const auto& input_descs = ra_exe_unit.input_descs;
2815  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2816  const int table_id = input_descs[scan_idx].getTableId();
2817  CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2818  selected_fragments_crossjoin.push_back(
2819  getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2820  for (const auto& col_id : col_global_ids) {
2821  CHECK(col_id);
2822  const auto& input_desc = col_id->getScanDesc();
2823  if (input_desc.getTableId() != table_id ||
2824  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2825  continue;
2826  }
2827  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2828  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2829  CHECK_LT(static_cast<size_t>(it->second),
2830  plan_state_->global_to_local_col_ids_.size());
2831  local_col_to_frag_pos[it->second] = frag_pos;
2832  }
2833  ++frag_pos;
2834  }
2835 }
#define CHECK_EQ(x, y)
Definition: Logger.h:214
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1049
#define CHECK_LT(x, y)
Definition: Logger.h:216
std::vector< size_t > getFragmentCount(const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:2792
#define CHECK(condition)
Definition: Logger.h:206
void Executor::buildSelectedFragsMappingForUnion ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList selected_fragments,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 2837 of file Execute.cpp.

References CHECK, CHECK_LT, and RelAlgExecutionUnit::input_descs.

2842  {
2843  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2844  size_t frag_pos{0};
2845  const auto& input_descs = ra_exe_unit.input_descs;
2846  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2847  const int table_id = input_descs[scan_idx].getTableId();
2848  // selected_fragments here is from assignFragsToKernelDispatch
2849  // execution_kernel.fragments
2850  if (selected_fragments[0].table_id != table_id) { // TODO 0
2851  continue;
2852  }
2853  // CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2854  selected_fragments_crossjoin.push_back(
2855  // getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2856  {size_t(1)}); // TODO
2857  for (const auto& col_id : col_global_ids) {
2858  CHECK(col_id);
2859  const auto& input_desc = col_id->getScanDesc();
2860  if (input_desc.getTableId() != table_id ||
2861  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2862  continue;
2863  }
2864  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2865  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2866  CHECK_LT(static_cast<size_t>(it->second),
2867  plan_state_->global_to_local_col_ids_.size());
2868  local_col_to_frag_pos[it->second] = frag_pos;
2869  }
2870  ++frag_pos;
2871  }
2872 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1049
#define CHECK_LT(x, y)
Definition: Logger.h:216
#define CHECK(condition)
Definition: Logger.h:206
llvm::Value * Executor::castToFP ( llvm::Value *  value,
SQLTypeInfo const from_ti,
SQLTypeInfo const to_ti 
)
private

Definition at line 3412 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, exp_to_scale(), logger::FATAL, SQLTypeInfo::get_scale(), SQLTypeInfo::get_size(), SQLTypeInfo::is_fp(), SQLTypeInfo::is_number(), and LOG.

3414  {
3416  if (value->getType()->isIntegerTy() && from_ti.is_number() && to_ti.is_fp() &&
3417  (!from_ti.is_fp() || from_ti.get_size() != to_ti.get_size())) {
3418  llvm::Type* fp_type{nullptr};
3419  switch (to_ti.get_size()) {
3420  case 4:
3421  fp_type = llvm::Type::getFloatTy(cgen_state_->context_);
3422  break;
3423  case 8:
3424  fp_type = llvm::Type::getDoubleTy(cgen_state_->context_);
3425  break;
3426  default:
3427  LOG(FATAL) << "Unsupported FP size: " << to_ti.get_size();
3428  }
3429  value = cgen_state_->ir_builder_.CreateSIToFP(value, fp_type);
3430  if (from_ti.get_scale()) {
3431  value = cgen_state_->ir_builder_.CreateFDiv(
3432  value,
3433  llvm::ConstantFP::get(value->getType(), exp_to_scale(from_ti.get_scale())));
3434  }
3435  }
3436  return value;
3437 }
#define LOG(tag)
Definition: Logger.h:200
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
#define AUTOMATIC_IR_METADATA(CGENSTATE)
uint64_t exp_to_scale(const unsigned exp)

+ Here is the call graph for this function:
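For scaled (DECIMAL) inputs the emitted IR is an SIToFP conversion followed by a division by 10^scale. A minimal sketch of the arithmetic, using a hypothetical helper name:

#include <cmath>
#include <cstdint>

// A DECIMAL(10, 2) column stores 123.45 as the integer 12345; converting it to
// a floating-point type divides by 10^scale after the integer-to-FP conversion.
double decimal_to_fp(int64_t storage_value, unsigned scale) {
  return static_cast<double>(storage_value) / std::pow(10.0, scale);
}
// decimal_to_fp(12345, 2) == 123.45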

llvm::Value * Executor::castToIntPtrTyIn ( llvm::Value *  val,
const size_t  bitWidth 
)
private

Definition at line 3439 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_LT, and get_int_type().

3439  {
3441  CHECK(val->getType()->isPointerTy());
3442 
3443  const auto val_ptr_type = static_cast<llvm::PointerType*>(val->getType());
3444  const auto val_type = val_ptr_type->getElementType();
3445  size_t val_width = 0;
3446  if (val_type->isIntegerTy()) {
3447  val_width = val_type->getIntegerBitWidth();
3448  } else {
3449  if (val_type->isFloatTy()) {
3450  val_width = 32;
3451  } else {
3452  CHECK(val_type->isDoubleTy());
3453  val_width = 64;
3454  }
3455  }
3456  CHECK_LT(size_t(0), val_width);
3457  if (bitWidth == val_width) {
3458  return val;
3459  }
3460  return cgen_state_->ir_builder_.CreateBitCast(
3461  val, llvm::PointerType::get(get_int_type(bitWidth, cgen_state_->context_), 0));
3462 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:216
#define CHECK(condition)
Definition: Logger.h:206

+ Here is the call graph for this function:
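The bitcast is only emitted when the pointee width differs from the requested width; float and double pointees are treated as 32- and 64-bit slots respectively. A minimal sketch of that decision, with hypothetical names:

#include <cstddef>

enum class SketchPointeeType { Int32, Int64, Float, Double };

size_t pointee_bit_width(SketchPointeeType t) {
  switch (t) {
    case SketchPointeeType::Int32: return 32;
    case SketchPointeeType::Int64: return 64;
    case SketchPointeeType::Float: return 32;   // float slots are 32 bits wide
    case SketchPointeeType::Double: return 64;  // double slots are 64 bits wide
  }
  return 0;
}

bool needs_bitcast(SketchPointeeType t, size_t requested_bit_width) {
  return pointee_bit_width(t) != requested_bit_width;
}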

bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3906 of file Execute.cpp.

3907  {
3908  // if current_query_session is equal to the candidate_query_session,
3909  // or it is empty session we consider
3910  return !candidate_query_session.empty() &&
3911  (current_query_session_ == candidate_query_session);
3912 }
static QuerySessionId current_query_session_
Definition: Execute.h:1096
bool Executor::checkIsQuerySessionEnrolled ( const QuerySessionId query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 4209 of file Execute.cpp.

4211  {
4212  if (query_session.empty()) {
4213  return false;
4214  }
4215  return !query_session.empty() && queries_session_map_.count(query_session);
4216 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1102
bool Executor::checkIsQuerySessionInterrupted ( const std::string &  query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 4193 of file Execute.cpp.

4195  {
4196  if (query_session.empty()) {
4197  return false;
4198  }
4199  auto flag_it = queries_interrupt_flag_.find(query_session);
4200  return !query_session.empty() && flag_it != queries_interrupt_flag_.end() &&
4201  flag_it->second;
4202 }
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1100
bool Executor::checkIsRunningQuerySessionInterrupted ( )

Definition at line 4204 of file Execute.cpp.

4204  {
4205  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
4206  return checkIsQuerySessionInterrupted(current_query_session_, session_read_lock);
4207 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1094
bool checkIsQuerySessionInterrupted(const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
Definition: Execute.cpp:4193
static QuerySessionId current_query_session_
Definition: Execute.h:1096
void Executor::checkPendingQueryStatus ( const QuerySessionId query_session)

Definition at line 3937 of file Execute.cpp.

References ERR_INTERRUPTED, and VLOG.

3937  {
3938  // check whether we are okay to execute the "pending" query, i.e., check
3939  // whether this query session was already interrupted before the query started running
3940  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
3941  if (query_session.empty()) {
3942  return;
3943  }
3944  if (queries_interrupt_flag_.find(query_session) == queries_interrupt_flag_.end()) {
3945  // something has gone wrong: it is the caller's responsibility to call
3946  // this function only for an enrolled query session
3947  if (!queries_session_map_.count(query_session)) {
3948  VLOG(1) << "Interrupting pending query is not available since the query session is "
3949  "not enrolled";
3950  } else {
3951  // here the query session is enrolled but the interrupt flag is not registered
3952  VLOG(1)
3953  << "Interrupting pending query is not available since its interrupt flag is "
3954  "not registered";
3955  }
3956  return;
3957  }
3958  if (queries_interrupt_flag_[query_session]) {
3959  throw QueryExecutionError(Executor::ERR_INTERRUPTED);
3960  }
3961 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1094
static QuerySessionMap queries_session_map_
Definition: Execute.h:1102
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1142
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1100
#define VLOG(n)
Definition: Logger.h:300
void Executor::clearMemory ( const Data_Namespace::MemoryLevel  memory_level)
static

Definition at line 192 of file Execute.cpp.

References Data_Namespace::DataMgr::clearMemory(), Data_Namespace::CPU_LEVEL, Catalog_Namespace::SysCatalog::getDataMgr(), Data_Namespace::GPU_LEVEL, Catalog_Namespace::SysCatalog::instance(), and CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCaches().

Referenced by DBHandler::clear_cpu_memory(), DBHandler::clear_gpu_memory(), QueryRunner::QueryRunner::clearCpuMemory(), and QueryRunner::QueryRunner::clearGpuMemory().

192  {
193  switch (memory_level) {
194  case Data_Namespace::MemoryLevel::CPU_LEVEL:
195  case Data_Namespace::MemoryLevel::GPU_LEVEL: {
196  mapd_unique_lock<mapd_shared_mutex> flush_lock(
197  execute_mutex_); // Don't flush memory while queries are running
198 
199  Catalog_Namespace::SysCatalog::instance().getDataMgr().clearMemory(memory_level);
200  if (memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
201  // The hash table cache uses CPU memory not managed by the buffer manager. In the
202  // future, we should manage these allocations with the buffer manager directly.
203  // For now, assume the user wants to purge the hash table cache when they clear
204  // CPU memory (currently used in ExecuteTest to lower memory pressure)
205  JoinHashTableCacheInvalidator::invalidateCaches();
206  }
207  break;
208  }
209  default: {
210  throw std::runtime_error(
211  "Clearing memory levels other than the CPU level or GPU level is not "
212  "supported.");
213  }
214  }
215 }
static mapd_shared_mutex execute_mutex_
Definition: Execute.h:1109
void clearMemory(const MemoryLevel memLevel)
Definition: DataMgr.cpp:384
static void invalidateCaches()
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:194
static SysCatalog & instance()
Definition: SysCatalog.h:293

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
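A usage sketch mirroring the callers listed above; the include path is an assumption:

#include "Execute.h"

void flush_executor_memory() {
  // Clearing CPU memory also purges the hash table cache, per the comment in the body.
  Executor::clearMemory(Data_Namespace::MemoryLevel::CPU_LEVEL);
  Executor::clearMemory(Data_Namespace::MemoryLevel::GPU_LEVEL);
}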

void Executor::clearMetaInfoCache ( )
private

Definition at line 384 of file Execute.cpp.

References input_table_info_cache_().

384  {
385  input_table_info_cache_.clear();
386  agg_col_range_cache_.clear();
387  table_generations_.clear();
388 }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1092
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:1091
TableGenerations table_generations_
Definition: Execute.h:1093

+ Here is the call graph for this function:

void Executor::clearQuerySessionStatus ( const QuerySessionId query_session,
const std::string &  submitted_time_str,
const bool  acquire_spin_lock 
)

Definition at line 3963 of file Execute.cpp.

References executor_id_().

3965  {
3966  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
3967  // clear the interrupt-related info for a finished query
3968  if (query_session.empty()) {
3969  return;
3970  }
3971  removeFromQuerySessionList(query_session, submitted_time_str, session_write_lock);
3972  if (query_session.compare(current_query_session_) == 0 &&
3973  running_query_executor_id_ == executor_id_) {
3974  invalidateRunningQuerySession(session_write_lock);
3975  if (acquire_spin_lock) {
3976  // try to release the executor's internal spin lock ("L") if and only if it was acquired;
3977  // otherwise the "L" lock does not concern us here.
3978  // For example, table import has no code path through the Executor,
3979  // so it only uses the executor's session management code and the global interrupt
3980  // flag, never this "L" lock
3981  execute_spin_lock_.clear(std::memory_order_release);
3982  }
3983  resetInterrupt();
3984  }
3985 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1094
void invalidateRunningQuerySession(mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:3914
static std::atomic_flag execute_spin_lock_
Definition: Execute.h:1105
const ExecutorId executor_id_
Definition: Execute.h:1077
static QuerySessionId current_query_session_
Definition: Execute.h:1096
void resetInterrupt()
bool removeFromQuerySessionList(const QuerySessionId &query_session, const std::string &submitted_time_str, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:4138
static size_t running_query_executor_id_
Definition: Execute.h:1098

+ Here is the call graph for this function:

llvm::Value * Executor::codegenAggregateWindowState ( )
private

Definition at line 326 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), Analyzer::WindowFunction::getKind(), kDECIMAL, kDOUBLE, and kFLOAT.

326  {
328  const auto pi32_type =
329  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
330  const auto pi64_type =
331  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
332  const auto window_func_context =
333  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
334  const Analyzer::WindowFunction* window_func = window_func_context->getWindowFunction();
335  const auto window_func_ti = get_adjusted_window_type_info(window_func);
336  const auto aggregate_state_type =
337  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
338  auto aggregate_state = aggregateWindowStatePtr();
339  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
340  const auto aggregate_state_count_i64 = cgen_state_->llInt(
341  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
342  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
343  aggregate_state_count_i64, aggregate_state_type);
344  const auto double_null_lv = cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE));
345  switch (window_func_ti.get_type()) {
346  case kFLOAT: {
347  return cgen_state_->emitCall(
348  "load_avg_float", {aggregate_state, aggregate_state_count, double_null_lv});
349  }
350  case kDOUBLE: {
351  return cgen_state_->emitCall(
352  "load_avg_double", {aggregate_state, aggregate_state_count, double_null_lv});
353  }
354  case kDECIMAL: {
355  return cgen_state_->emitCall(
356  "load_avg_decimal",
357  {aggregate_state,
358  aggregate_state_count,
359  double_null_lv,
360  cgen_state_->llInt<int32_t>(window_func_ti.get_scale())});
361  }
362  default: {
363  return cgen_state_->emitCall(
364  "load_avg_int", {aggregate_state, aggregate_state_count, double_null_lv});
365  }
366  }
367  }
368  if (window_func->getKind() == SqlWindowFunctionKind::COUNT) {
369  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
370  }
371  switch (window_func_ti.get_type()) {
372  case kFLOAT: {
373  return cgen_state_->emitCall("load_float", {aggregate_state});
374  }
375  case kDOUBLE: {
376  return cgen_state_->emitCall("load_double", {aggregate_state});
377  }
378  default: {
379  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
380  }
381  }
382 }
SqlWindowFunctionKind getKind() const
Definition: Analyzer.h:1453
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:
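For AVG the final value is loaded through a type-specific runtime helper; other aggregates either use a plain loader or read the 64-bit state directly. A minimal sketch of the name selection for the AVG case, using a hypothetical helper that is not part of CgenState:

#include <string>

std::string avg_loader_for(const std::string& window_func_type) {
  if (window_func_type == "FLOAT") {
    return "load_avg_float";
  }
  if (window_func_type == "DOUBLE") {
    return "load_avg_double";
  }
  if (window_func_type == "DECIMAL") {
    return "load_avg_decimal";  // also receives the decimal scale as an extra argument
  }
  return "load_avg_int";
}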

void Executor::codegenJoinLoops ( const std::vector< JoinLoop > &  join_loops,
const RelAlgExecutionUnit ra_exe_unit,
GroupByAndAggregate group_by_and_aggregate,
llvm::Function *  query_func,
llvm::BasicBlock *  entry_bb,
const QueryMemoryDescriptor query_mem_desc,
const CompilationOptions co,
const ExecutionOptions eo 
)
private

Definition at line 842 of file IRCodegen.cpp.

References ExecutionOptions::allow_runtime_query_interrupt, AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, JoinLoop::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::device_type, CgenState::ir_builder_, CgenState::llInt(), CgenState::needs_error_check_, CodeGenerator::posArg(), GroupByAndAggregate::query_infos_, and ExecutionOptions::with_dynamic_watchdog.

849  {
851  const auto exit_bb =
852  llvm::BasicBlock::Create(cgen_state_->context_, "exit", cgen_state_->current_func_);
853  cgen_state_->ir_builder_.SetInsertPoint(exit_bb);
854  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
855  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
856  CodeGenerator code_generator(this);
857  const auto loops_entry_bb = JoinLoop::codegen(
858  join_loops,
859  /*body_codegen=*/
860  [this,
861  query_func,
862  &query_mem_desc,
863  &co,
864  &eo,
865  &group_by_and_aggregate,
866  &join_loops,
867  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
869  addJoinLoopIterator(prev_iters, join_loops.size());
870  auto& builder = cgen_state_->ir_builder_;
871  const auto loop_body_bb = llvm::BasicBlock::Create(
872  builder.getContext(), "loop_body", builder.GetInsertBlock()->getParent());
873  builder.SetInsertPoint(loop_body_bb);
874  const bool can_return_error =
875  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
876  if (can_return_error || cgen_state_->needs_error_check_ ||
877  eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
878  createErrorCheckControlFlow(query_func,
879  eo.with_dynamic_watchdog,
880  eo.allow_runtime_query_interrupt,
881  co.device_type,
882  group_by_and_aggregate.query_infos_);
883  }
884  return loop_body_bb;
885  },
886  /*outer_iter=*/code_generator.posArg(nullptr),
887  exit_bb,
888  cgen_state_.get());
889  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
890  cgen_state_->ir_builder_.CreateBr(loops_entry_bb);
891 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
static llvm::BasicBlock * codegen(const std::vector< JoinLoop > &join_loops, const std::function< llvm::BasicBlock *(const std::vector< llvm::Value * > &)> &body_codegen, llvm::Value *outer_iter, llvm::BasicBlock *exit_bb, CgenState *cgen_state)
Definition: JoinLoop.cpp:48
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
const std::vector< InputTableInfo > & query_infos_
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:824
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)

+ Here is the call graph for this function:

llvm::BasicBlock * Executor::codegenSkipDeletedOuterTableRow ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co 
)
private

Definition at line 3000 of file NativeCodegen.cpp.

3002  {
3004  if (!co.filter_on_deleted_column) {
3005  return nullptr;
3006  }
3007  CHECK(!ra_exe_unit.input_descs.empty());
3008  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
3009  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
3010  return nullptr;
3011  }
3012  const auto deleted_cd =
3013  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
3014  if (!deleted_cd) {
3015  return nullptr;
3016  }
3017  CHECK(deleted_cd->columnType.is_boolean());
3018  const auto deleted_expr =
3019  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
3020  outer_input_desc.getTableId(),
3021  deleted_cd->columnId,
3022  outer_input_desc.getNestLevel());
3023  CodeGenerator code_generator(this);
3024  const auto is_deleted =
3025  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
3026  const auto is_deleted_bb = llvm::BasicBlock::Create(
3027  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
3028  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
3029  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
3030  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
3031  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
3032  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3033  cgen_state_->ir_builder_.SetInsertPoint(bb);
3034  return bb;
3035 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1049
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:206
void Executor::codegenWindowAvgEpilogue ( llvm::Value *  crt_val,
llvm::Value *  window_func_null_val,
llvm::Value *  multiplicity_lv 
)
private

Definition at line 289 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

291  {
293  const auto window_func_context =
294  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
295  const auto window_func = window_func_context->getWindowFunction();
296  const auto window_func_ti = get_adjusted_window_type_info(window_func);
297  const auto pi32_type =
298  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
299  const auto pi64_type =
300  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
301  const auto aggregate_state_type =
302  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
303  const auto aggregate_state_count_i64 = cgen_state_->llInt(
304  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
305  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
306  aggregate_state_count_i64, aggregate_state_type);
307  std::string agg_count_func_name = "agg_count";
308  switch (window_func_ti.get_type()) {
309  case kFLOAT: {
310  agg_count_func_name += "_float";
311  break;
312  }
313  case kDOUBLE: {
314  agg_count_func_name += "_double";
315  break;
316  }
317  default: {
318  break;
319  }
320  }
321  agg_count_func_name += "_skip_val";
322  cgen_state_->emitCall(agg_count_func_name,
323  {aggregate_state_count, crt_val, window_func_null_val});
324 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunction ( const size_t  target_index,
const CompilationOptions co 
)
private

Definition at line 21 of file WindowFunctionIR.cpp.

References WindowProjectNodeContext::activateWindowFunctionContext(), run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, COUNT, CUME_DIST, DENSE_RANK, logger::FATAL, FIRST_VALUE, WindowProjectNodeContext::get(), WindowFunctionContext::getWindowFunction(), LAG, LAST_VALUE, LEAD, LOG, MAX, MIN, NTILE, PERCENT_RANK, RANK, ROW_NUMBER, and SUM.

22  {
24  CodeGenerator code_generator(this);
25  const auto window_func_context =
27  target_index);
28  const auto window_func = window_func_context->getWindowFunction();
29  switch (window_func->getKind()) {
30  case SqlWindowFunctionKind::ROW_NUMBER:
31  case SqlWindowFunctionKind::RANK:
32  case SqlWindowFunctionKind::DENSE_RANK:
33  case SqlWindowFunctionKind::NTILE: {
34  return cgen_state_->emitCall("row_number_window_func",
35  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
36  window_func_context->output())),
37  code_generator.posArg(nullptr)});
38  }
39  case SqlWindowFunctionKind::PERCENT_RANK:
40  case SqlWindowFunctionKind::CUME_DIST: {
41  return cgen_state_->emitCall("percent_window_func",
42  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
43  window_func_context->output())),
44  code_generator.posArg(nullptr)});
45  }
46  case SqlWindowFunctionKind::LAG:
47  case SqlWindowFunctionKind::LEAD:
48  case SqlWindowFunctionKind::FIRST_VALUE:
49  case SqlWindowFunctionKind::LAST_VALUE: {
50  CHECK(WindowProjectNodeContext::get(this));
51  const auto& args = window_func->getArgs();
52  CHECK(!args.empty());
53  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
54  CHECK_EQ(arg_lvs.size(), size_t(1));
55  return arg_lvs.front();
56  }
57  case SqlWindowFunctionKind::AVG:
58  case SqlWindowFunctionKind::MIN:
59  case SqlWindowFunctionKind::MAX:
60  case SqlWindowFunctionKind::SUM:
61  case SqlWindowFunctionKind::COUNT: {
62  return codegenWindowFunctionAggregate(co);
63  }
64  default: {
65  LOG(FATAL) << "Invalid window function kind";
66  }
67  }
68  return nullptr;
69 }
#define CHECK_EQ(x, y)
Definition: Logger.h:214
#define LOG(tag)
Definition: Logger.h:200
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
static const WindowProjectNodeContext * get(Executor *executor)
const WindowFunctionContext * activateWindowFunctionContext(Executor *executor, const size_t target_index) const
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenWindowFunctionAggregate(const CompilationOptions &co)
#define CHECK(condition)
Definition: Logger.h:206
const Analyzer::WindowFunction * getWindowFunction() const

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunctionAggregate ( const CompilationOptions co)
private

Definition at line 140 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, CHECK, WindowProjectNodeContext::get(), get_int_type(), and WindowProjectNodeContext::getActiveWindowFunctionContext().

140  {
142  const auto reset_state_false_bb = codegenWindowResetStateControlFlow();
143  auto aggregate_state = aggregateWindowStatePtr();
144  llvm::Value* aggregate_state_count = nullptr;
145  const auto window_func_context =
146  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
147  const auto window_func = window_func_context->getWindowFunction();
148  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
149  const auto aggregate_state_count_i64 = cgen_state_->llInt(
150  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
151  const auto pi64_type =
152  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
153  aggregate_state_count =
154  cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_count_i64, pi64_type);
155  }
156  codegenWindowFunctionStateInit(aggregate_state);
157  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
158  const auto count_zero = cgen_state_->llInt(int64_t(0));
159  cgen_state_->emitCall("agg_id", {aggregate_state_count, count_zero});
160  }
161  cgen_state_->ir_builder_.CreateBr(reset_state_false_bb);
162  cgen_state_->ir_builder_.SetInsertPoint(reset_state_false_bb);
164  return codegenWindowFunctionAggregateCalls(aggregate_state, co);
165 }
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
static const WindowProjectNodeContext * get(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
void codegenWindowFunctionStateInit(llvm::Value *aggregate_state)
#define CHECK(condition)
Definition: Logger.h:206
llvm::Value * codegenWindowFunctionAggregateCalls(llvm::Value *aggregate_state, const CompilationOptions &co)
llvm::BasicBlock * codegenWindowResetStateControlFlow()

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunctionAggregateCalls ( llvm::Value *  aggregate_state,
const CompilationOptions co 
)
private

Definition at line 246 of file WindowFunctionIR.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, CodeGenerator::codegen(), CodeGenerator::codegenCastBetweenIntTypes(), COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), anonymous_namespace{WindowFunctionIR.cpp}::get_window_agg_name(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kFLOAT, and SUM.

247  {
249  const auto window_func_context =
250  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
251  const auto window_func = window_func_context->getWindowFunction();
252  const auto window_func_ti = get_adjusted_window_type_info(window_func);
253  const auto window_func_null_val =
254  window_func_ti.is_fp()
255  ? cgen_state_->inlineFpNull(window_func_ti)
256  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
257  const auto& args = window_func->getArgs();
258  llvm::Value* crt_val;
259  if (args.empty()) {
260  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
261  crt_val = cgen_state_->llInt(int64_t(1));
262  } else {
263  CodeGenerator code_generator(this);
264  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
265  CHECK_EQ(arg_lvs.size(), size_t(1));
266  if (window_func->getKind() == SqlWindowFunctionKind::SUM && !window_func_ti.is_fp()) {
267  crt_val = code_generator.codegenCastBetweenIntTypes(
268  arg_lvs.front(), args.front()->get_type_info(), window_func_ti, false);
269  } else {
270  crt_val = window_func_ti.get_type() == kFLOAT
271  ? arg_lvs.front()
272  : cgen_state_->castToTypeIn(arg_lvs.front(), 64);
273  }
274  }
275  const auto agg_name = get_window_agg_name(window_func->getKind(), window_func_ti);
276  llvm::Value* multiplicity_lv = nullptr;
277  if (args.empty()) {
278  cgen_state_->emitCall(agg_name, {aggregate_state, crt_val});
279  } else {
280  cgen_state_->emitCall(agg_name + "_skip_val",
281  {aggregate_state, crt_val, window_func_null_val});
282  }
283  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
284  codegenWindowAvgEpilogue(crt_val, window_func_null_val, multiplicity_lv);
285  }
286  return codegenAggregateWindowState();
287 }
#define CHECK_EQ(x, y)
Definition: Logger.h:214
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
std::string get_window_agg_name(const SqlWindowFunctionKind kind, const SQLTypeInfo &window_func_ti)
void codegenWindowAvgEpilogue(llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenAggregateWindowState()
#define CHECK(condition)
Definition: Logger.h:206
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:

void Executor::codegenWindowFunctionStateInit ( llvm::Value *  aggregate_state)
private

Definition at line 196 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

196  {
198  const auto window_func_context =
199  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
200  const auto window_func = window_func_context->getWindowFunction();
201  const auto window_func_ti = get_adjusted_window_type_info(window_func);
202  const auto window_func_null_val =
203  window_func_ti.is_fp()
204  ? cgen_state_->inlineFpNull(window_func_ti)
205  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
206  llvm::Value* window_func_init_val;
207  if (window_func_context->getWindowFunction()->getKind() ==
208  SqlWindowFunctionKind::COUNT) {
209  switch (window_func_ti.get_type()) {
210  case kFLOAT: {
211  window_func_init_val = cgen_state_->llFp(float(0));
212  break;
213  }
214  case kDOUBLE: {
215  window_func_init_val = cgen_state_->llFp(double(0));
216  break;
217  }
218  default: {
219  window_func_init_val = cgen_state_->llInt(int64_t(0));
220  break;
221  }
222  }
223  } else {
224  window_func_init_val = window_func_null_val;
225  }
226  const auto pi32_type =
227  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
228  switch (window_func_ti.get_type()) {
229  case kDOUBLE: {
230  cgen_state_->emitCall("agg_id_double", {aggregate_state, window_func_init_val});
231  break;
232  }
233  case kFLOAT: {
234  aggregate_state =
235  cgen_state_->ir_builder_.CreateBitCast(aggregate_state, pi32_type);
236  cgen_state_->emitCall("agg_id_float", {aggregate_state, window_func_init_val});
237  break;
238  }
239  default: {
240  cgen_state_->emitCall("agg_id", {aggregate_state, window_func_init_val});
241  break;
242  }
243  }
244 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:

llvm::BasicBlock * Executor::codegenWindowResetStateControlFlow ( )
private

Definition at line 167 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, WindowProjectNodeContext::getActiveWindowFunctionContext(), CodeGenerator::posArg(), and CodeGenerator::toBool().

167  {
169  const auto window_func_context =
170  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
171  const auto bitset = cgen_state_->llInt(
172  reinterpret_cast<const int64_t>(window_func_context->partitionStart()));
173  const auto min_val = cgen_state_->llInt(int64_t(0));
174  const auto max_val = cgen_state_->llInt(window_func_context->elementCount() - 1);
175  const auto null_val = cgen_state_->llInt(inline_int_null_value<int64_t>());
176  const auto null_bool_val = cgen_state_->llInt<int8_t>(inline_int_null_value<int8_t>());
177  CodeGenerator code_generator(this);
178  const auto reset_state =
179  code_generator.toBool(cgen_state_->emitCall("bit_is_set",
180  {bitset,
181  code_generator.posArg(nullptr),
182  min_val,
183  max_val,
184  null_val,
185  null_bool_val}));
186  const auto reset_state_true_bb = llvm::BasicBlock::Create(
187  cgen_state_->context_, "reset_state.true", cgen_state_->current_func_);
188  const auto reset_state_false_bb = llvm::BasicBlock::Create(
189  cgen_state_->context_, "reset_state.false", cgen_state_->current_func_);
190  cgen_state_->ir_builder_.CreateCondBr(
191  reset_state, reset_state_true_bb, reset_state_false_bb);
192  cgen_state_->ir_builder_.SetInsertPoint(reset_state_true_bb);
193  return reset_state_false_bb;
194 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)

+ Here is the call graph for this function:

ResultSetPtr Executor::collectAllDeviceResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
const QueryMemoryDescriptor query_mem_desc,
const ExecutorDeviceType  device_type,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner 
)
private

Definition at line 1907 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::build_row_for_empty_input(), catalog_(), DEBUG_TIMER, SharedKernelContext::getFragmentResults(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, NonGroupedAggregate, GroupByAndAggregate::shard_count_for_top_groups(), RelAlgExecutionUnit::target_exprs, and use_speculative_top_n().

1912  {
1913  auto timer = DEBUG_TIMER(__func__);
1914  auto& result_per_device = shared_context.getFragmentResults();
1915  if (result_per_device.empty() && query_mem_desc.getQueryDescriptionType() ==
1916  QueryDescriptionType::NonGroupedAggregate) {
1917  return build_row_for_empty_input(
1918  ra_exe_unit.target_exprs, query_mem_desc, device_type);
1919  }
1920  if (use_speculative_top_n(ra_exe_unit, query_mem_desc)) {
1921  try {
1922  return reduceSpeculativeTopN(
1923  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1924  } catch (const std::bad_alloc&) {
1925  throw SpeculativeTopNFailed("Failed during multi-device reduction.");
1926  }
1927  }
1928  const auto shard_count =
1929  device_type == ExecutorDeviceType::GPU
1930  ? GroupByAndAggregate::shard_count_for_top_groups(ra_exe_unit, *catalog_)
1931  : 0;
1932 
1933  if (shard_count && !result_per_device.empty()) {
1934  return collectAllDeviceShardedTopResults(shared_context, ra_exe_unit);
1935  }
1936  return reduceMultiDeviceResults(
1937  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1938 }
std::vector< Analyzer::Expr * > target_exprs
bool use_speculative_top_n(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc)
ResultSetPtr reduceSpeculativeTopN(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:1012
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1078
ResultSetPtr reduceMultiDeviceResults(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:905
ResultSetPtr collectAllDeviceShardedTopResults(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
Definition: Execute.cpp:2022
QueryDescriptionType getQueryDescriptionType() const
ResultSetPtr build_row_for_empty_input(const std::vector< Analyzer::Expr * > &target_exprs_in, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: Execute.cpp:1865
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define DEBUG_TIMER(name)
Definition: Logger.h:322
static size_t shard_count_for_top_groups(const RelAlgExecutionUnit &ra_exe_unit, const Catalog_Namespace::Catalog &catalog)

+ Here is the call graph for this function:

ResultSetPtr Executor::collectAllDeviceShardedTopResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit 
) const
private

Definition at line 2022 of file Execute.cpp.

References catalog_(), CHECK, CHECK_EQ, CHECK_LE, SharedKernelContext::getFragmentResults(), SortInfo::limit, SortInfo::offset, SortInfo::order_entries, anonymous_namespace{Execute.cpp}::permute_storage_columnar(), anonymous_namespace{Execute.cpp}::permute_storage_row_wise(), run_benchmark_import::result, and RelAlgExecutionUnit::sort_info.

2024  {
2025  auto& result_per_device = shared_context.getFragmentResults();
2026  const auto first_result_set = result_per_device.front().first;
2027  CHECK(first_result_set);
2028  auto top_query_mem_desc = first_result_set->getQueryMemDesc();
2029  CHECK(!top_query_mem_desc.hasInterleavedBinsOnGpu());
2030  const auto top_n = ra_exe_unit.sort_info.limit + ra_exe_unit.sort_info.offset;
2031  top_query_mem_desc.setEntryCount(0);
2032  for (auto& result : result_per_device) {
2033  const auto result_set = result.first;
2034  CHECK(result_set);
2035  result_set->sort(ra_exe_unit.sort_info.order_entries, top_n, this);
2036  size_t new_entry_cnt = top_query_mem_desc.getEntryCount() + result_set->rowCount();
2037  top_query_mem_desc.setEntryCount(new_entry_cnt);
2038  }
2039  auto top_result_set = std::make_shared<ResultSet>(first_result_set->getTargetInfos(),
2040  first_result_set->getDeviceType(),
2041  top_query_mem_desc,
2042  first_result_set->getRowSetMemOwner(),
2043  catalog_,
2044  blockSize(),
2045  gridSize());
2046  auto top_storage = top_result_set->allocateStorage();
2047  size_t top_output_row_idx{0};
2048  for (auto& result : result_per_device) {
2049  const auto result_set = result.first;
2050  CHECK(result_set);
2051  const auto& top_permutation = result_set->getPermutationBuffer();
2052  CHECK_LE(top_permutation.size(), top_n);
2053  if (top_query_mem_desc.didOutputColumnar()) {
2054  top_output_row_idx = permute_storage_columnar(result_set->getStorage(),
2055  result_set->getQueryMemDesc(),
2056  top_storage,
2057  top_output_row_idx,
2058  top_query_mem_desc,
2059  top_permutation);
2060  } else {
2061  top_output_row_idx = permute_storage_row_wise(result_set->getStorage(),
2062  top_storage,
2063  top_output_row_idx,
2064  top_query_mem_desc,
2065  top_permutation);
2066  }
2067  }
2068  CHECK_EQ(top_output_row_idx, top_query_mem_desc.getEntryCount());
2069  return top_result_set;
2070 }
#define CHECK_EQ(x, y)
Definition: Logger.h:214
const std::list< Analyzer::OrderEntry > order_entries
size_t permute_storage_row_wise(const ResultSetStorage *input_storage, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:2001
const size_t limit
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1078
#define CHECK_LE(x, y)
Definition: Logger.h:217
unsigned gridSize() const
Definition: Execute.cpp:3379
size_t permute_storage_columnar(const ResultSetStorage *input_storage, const QueryMemoryDescriptor &input_query_mem_desc, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1951
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define CHECK(condition)
Definition: Logger.h:206
unsigned blockSize() const
Definition: Execute.cpp:3393
const size_t offset

+ Here is the call graph for this function:
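Each device's result set is sorted and truncated to limit + offset rows before its storage is permuted into the single output result set. A minimal sketch of that merge strategy over plain integers, illustrative only:

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<int> merge_sharded_top_n(std::vector<std::vector<int>> per_device,
                                     size_t limit, size_t offset) {
  const size_t top_n = limit + offset;
  std::vector<int> merged;
  for (auto& rows : per_device) {
    std::sort(rows.begin(), rows.end());  // stands in for ResultSet::sort()
    if (rows.size() > top_n) {
      rows.resize(top_n);  // keep at most limit + offset rows per device
    }
    merged.insert(merged.end(), rows.begin(), rows.end());
  }
  return merged;
}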

bool Executor::compileBody ( const RelAlgExecutionUnit ra_exe_unit,
GroupByAndAggregate group_by_and_aggregate,
const QueryMemoryDescriptor query_mem_desc,
const CompilationOptions co,
const GpuSharedMemoryContext gpu_smem_context = {} 
)
private

Definition at line 3037 of file NativeCodegen.cpp.

3041  {
3043 
3044  // Switch the code generation into a separate filter function if enabled.
3045  // Note that accesses to function arguments are still codegenned from the
3046  // row function's arguments, then later automatically forwarded and
3047  // remapped into filter function arguments by redeclareFilterFunction().
3048  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
3049  llvm::Value* loop_done{nullptr};
3050  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
3051  if (cgen_state_->filter_func_) {
3052  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3053  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
3054  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
3055  row_func_entry_bb->begin());
3056  loop_done = cgen_state_->ir_builder_.CreateAlloca(
3057  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
3058  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3059  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
3060  }
3061  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
3062  cgen_state_->current_func_ = cgen_state_->filter_func_;
3063  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
3064  }
3065 
3066  // generate the code for the filter
3067  std::vector<Analyzer::Expr*> primary_quals;
3068  std::vector<Analyzer::Expr*> deferred_quals;
3069  bool short_circuited = CodeGenerator::prioritizeQuals(
3070  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
3071  if (short_circuited) {
3072  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
3073  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
3074  << " quals";
3075  }
3076  llvm::Value* filter_lv = cgen_state_->llBool(true);
3077  CodeGenerator code_generator(this);
3078  for (auto expr : primary_quals) {
3079  // Generate the filter for primary quals
3080  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
3081  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
3082  }
3083  CHECK(filter_lv->getType()->isIntegerTy(1));
3084  llvm::BasicBlock* sc_false{nullptr};
3085  if (!deferred_quals.empty()) {
3086  auto sc_true = llvm::BasicBlock::Create(
3087  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
3088  sc_false = llvm::BasicBlock::Create(
3089  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
3090  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3091  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3092  if (ra_exe_unit.join_quals.empty()) {
3093  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3094  }
3095  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3096  filter_lv = cgen_state_->llBool(true);
3097  }
3098  for (auto expr : deferred_quals) {
3099  filter_lv = cgen_state_->ir_builder_.CreateAnd(
3100  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
3101  }
3102 
3103  CHECK(filter_lv->getType()->isIntegerTy(1));
3104  auto ret = group_by_and_aggregate.codegen(
3105  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3106 
3107  // Switch the code generation back to the row function if a filter
3108  // function was enabled.
3109  if (cgen_state_->filter_func_) {
3110  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3111  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
3112  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3113  }
3114 
3115  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3116  cgen_state_->current_func_ = cgen_state_->row_func_;
3117  cgen_state_->filter_func_call_ =
3118  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3119 
3120  // Create real filter function declaration after placeholder call
3121  // is emitted.
3122  redeclareFilterFunction();
3123 
3124  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3125  auto loop_done_true = llvm::BasicBlock::Create(
3126  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
3127  auto loop_done_false = llvm::BasicBlock::Create(
3128  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
3129  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
3130  cgen_state_->ir_builder_.CreateCondBr(
3131  loop_done_flag, loop_done_true, loop_done_false);
3132  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3133  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3134  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3135  } else {
3136  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3137  }
3138  }
3139  return ret;
3140 }
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1049
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr * > &primary_quals, std::vector< Analyzer::Expr * > &deferred_quals, const PlanState::HoistedFiltersSet &hoisted_quals)
Definition: LogicalIR.cpp:157
#define CHECK(condition)
Definition: Logger.h:206
void redeclareFilterFunction()
Definition: IRCodegen.cpp:725
#define VLOG(n)
Definition: Logger.h:300
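
The control flow emitted above can be read as row-level logic: the cheap, prioritized quals are evaluated first, and only rows that survive them ever evaluate the deferred (short-circuited) quals. A minimal sketch of that behavior, using hypothetical Row and Qual stand-ins rather than the generated LLVM IR:

#include <functional>
#include <vector>

// Hypothetical stand-ins for illustration only; the real engine emits LLVM IR
// for these checks instead of invoking std::function objects per row.
struct Row {};
using Qual = std::function<bool(const Row&)>;

// Mirrors the sc_true / sc_false branching: if any primary qual fails, the
// deferred quals are never evaluated for that row (the short-circuit path).
bool row_passes_filter(const Row& row,
                       const std::vector<Qual>& primary_quals,
                       const std::vector<Qual>& deferred_quals) {
  for (const auto& qual : primary_quals) {
    if (!qual(row)) {
      return false;  // corresponds to branching to sc_false
    }
  }
  for (const auto& qual : deferred_quals) {  // corresponds to the sc_true block
    if (!qual(row)) {
      return false;
    }
  }
  return true;
}
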
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > Executor::compileWorkUnit ( const std::vector< InputTableInfo > &  query_infos,
const PlanState::DeletedColumnsMap deleted_cols_map,
const RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co,
const ExecutionOptions eo,
const CudaMgr_Namespace::CudaMgr cuda_mgr,
const bool  allow_lazy_fetch,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
const bool  has_cardinality_estimation,
ColumnCacheMap column_cache,
RenderInfo render_info = nullptr 
)
private

Definition at line 2526 of file NativeCodegen.cpp.

2538  {
2539  auto timer = DEBUG_TIMER(__func__);
2540 
2541  if (co.device_type == ExecutorDeviceType::GPU) {
2542  const auto cuda_mgr = data_mgr_->getCudaMgr();
2543  if (!cuda_mgr) {
2544  throw QueryMustRunOnCpu();
2545  }
2546  }
2547 
2548 #ifndef NDEBUG
2549  static std::uint64_t counter = 0;
2550  ++counter;
2551  VLOG(1) << "CODEGEN #" << counter << ":";
2552  LOG(IR) << "CODEGEN #" << counter << ":";
2553  LOG(PTX) << "CODEGEN #" << counter << ":";
2554  LOG(ASM) << "CODEGEN #" << counter << ":";
2555 #endif
2556 
2557  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2558 
2559  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2560 
2561  GroupByAndAggregate group_by_and_aggregate(
2562  this,
2563  co.device_type,
2564  ra_exe_unit,
2565  query_infos,
2566  row_set_mem_owner,
2567  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2568  : std::nullopt);
2569  auto query_mem_desc =
2570  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2571  max_groups_buffer_entry_guess,
2572  crt_min_byte_width,
2573  render_info,
2574  eo.output_columnar_hint);
2575 
2576  if (query_mem_desc->getQueryDescriptionType() ==
2577  QueryDescriptionType::GroupByBaselineHash &&
2578  !has_cardinality_estimation &&
2579  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2580  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2581  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2582  }
2583 
2584  const bool output_columnar = query_mem_desc->didOutputColumnar();
2585  const bool gpu_shared_mem_optimization =
2586  is_gpu_shared_mem_supported(query_mem_desc.get(),
2587  ra_exe_unit,
2588  cuda_mgr,
2589  co.device_type,
2590  cuda_mgr ? this->blockSize() : 1,
2591  cuda_mgr ? this->numBlocksPerMP() : 1);
2592  if (gpu_shared_mem_optimization) {
2593  // disable interleaved bins optimization on the GPU
2594  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2595  LOG(DEBUG1) << "GPU shared memory is used for the " +
2596  query_mem_desc->queryDescTypeToString() + " query(" +
2597  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2598  query_mem_desc.get())) +
2599  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2600  }
2601 
2602  const GpuSharedMemoryContext gpu_smem_context(
2603  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2604 
2605  if (co.device_type == ExecutorDeviceType::GPU) {
2606  const size_t num_count_distinct_descs =
2607  query_mem_desc->getCountDistinctDescriptorsSize();
2608  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2609  const auto& count_distinct_descriptor =
2610  query_mem_desc->getCountDistinctDescriptor(i);
2611  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2612  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2613  !co.hoist_literals)) {
2614  throw QueryMustRunOnCpu();
2615  }
2616  }
2617  }
2618 
2619  // Read the module template and target either CPU or GPU
2620  // by binding the stream position functions to the right implementation:
2621  // stride access for GPU, contiguous for CPU
2622  auto rt_module_copy = llvm::CloneModule(
2623  *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
2624  auto func = llvm::dyn_cast<llvm::Function>(gv);
2625  if (!func) {
2626  return true;
2627  }
2628  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2629  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2630  CodeGenerator::alwaysCloneRuntimeFunction(func));
2631  });
2632  if (co.device_type == ExecutorDeviceType::CPU) {
2633  if (is_udf_module_present(true)) {
2634  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
2635  }
2636  if (is_rt_udf_module_present(true)) {
2637  CodeGenerator::link_udf_module(
2638  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2639  }
2640  } else {
2641  rt_module_copy->setDataLayout(get_gpu_data_layout());
2642  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2643  if (is_udf_module_present()) {
2644  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
2645  }
2646  if (is_rt_udf_module_present()) {
2647  CodeGenerator::link_udf_module(
2648  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2649  }
2650  }
2651 
2652  cgen_state_->module_ = rt_module_copy.release();
2653  AUTOMATIC_IR_METADATA(cgen_state_.get());
2654 
2655  auto agg_fnames =
2656  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2657 
2658  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2659 
2660  const bool is_group_by{query_mem_desc->isGroupBy()};
2661  auto [query_func, row_func_call] = is_group_by
2662  ? query_group_by_template(cgen_state_->module_,
2663  co.hoist_literals,
2664  *query_mem_desc,
2665  co.device_type,
2666  ra_exe_unit.scan_limit,
2667  gpu_smem_context)
2668  : query_template(cgen_state_->module_,
2669  agg_slot_count,
2670  co.hoist_literals,
2671  !!ra_exe_unit.estimator,
2672  gpu_smem_context);
2673  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2674  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2675  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2676 
2677  cgen_state_->query_func_ = query_func;
2678  cgen_state_->row_func_call_ = row_func_call;
2679  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2680  &query_func->getEntryBlock().front());
2681 
2682  // Generate the function signature and column head fetches s.t.
2683  // double indirection isn't needed in the inner loop
2684  auto& fetch_bb = query_func->front();
2685  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2686  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2687  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2688  query_func->args().begin(),
2689  fetch_ir_builder,
2690  cgen_state_->context_);
2691  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2692 
2693  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2694  is_group_by ? 0 : agg_slot_count,
2695  co.hoist_literals,
2696  cgen_state_->module_,
2697  cgen_state_->context_);
2698  CHECK(cgen_state_->row_func_);
2699  cgen_state_->row_func_bb_ =
2700  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2701 
2702  if (g_enable_filter_function) {
2703  auto filter_func_ft =
2704  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2705  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2706  llvm::Function::ExternalLinkage,
2707  "filter_func",
2708  cgen_state_->module_);
2709  CHECK(cgen_state_->filter_func_);
2710  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2711  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2712  }
2713 
2714  cgen_state_->current_func_ = cgen_state_->row_func_;
2715  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2716 
2717  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2718  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2719  const auto join_loops =
2720  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2721 
2722  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2723  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2724  if (is_not_deleted_bb) {
2725  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2726  }
2727  if (!join_loops.empty()) {
2728  codegenJoinLoops(join_loops,
2729  body_execution_unit,
2730  group_by_and_aggregate,
2731  query_func,
2732  cgen_state_->row_func_bb_,
2733  *(query_mem_desc.get()),
2734  co,
2735  eo);
2736  } else {
2737  const bool can_return_error = compileBody(
2738  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2739  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2740  eo.allow_runtime_query_interrupt) {
2741  createErrorCheckControlFlow(query_func,
2742  eo.with_dynamic_watchdog,
2743  eo.allow_runtime_query_interrupt,
2744  co.device_type,
2745  group_by_and_aggregate.query_infos_);
2746  }
2747  }
2748  std::vector<llvm::Value*> hoisted_literals;
2749 
2750  if (co.hoist_literals) {
2751  VLOG(1) << "number of hoisted literals: "
2752  << cgen_state_->query_func_literal_loads_.size()
2753  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2754  << " bytes";
2755  }
2756 
2757  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2758  // we have some hoisted literals...
2759  hoisted_literals = inlineHoistedLiterals();
2760  }
2761 
2762  // replace the row func placeholder call with the call to the actual row func
2763  std::vector<llvm::Value*> row_func_args;
2764  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2765  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2766  }
2767  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2768  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2769  // push hoisted literals arguments, if any
2770  row_func_args.insert(
2771  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2772  llvm::ReplaceInstWithInst(
2773  cgen_state_->row_func_call_,
2774  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2775 
2776  // replace the filter func placeholder call with the call to the actual filter func
2777  if (cgen_state_->filter_func_) {
2778  std::vector<llvm::Value*> filter_func_args;
2779  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2780  arg_it != cgen_state_->filter_func_args_.end();
2781  ++arg_it) {
2782  filter_func_args.push_back(arg_it->first);
2783  }
2784  llvm::ReplaceInstWithInst(
2785  cgen_state_->filter_func_call_,
2786  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2787  }
2788 
2789  // Aggregate
2790  plan_state_->init_agg_vals_ =
2791  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2792 
2793  /*
2794  * If we have decided to use GPU shared memory (decision is not made here), then
2795  * we generate proper code for extra components that it needs (buffer initialization and
2796  * gpu reduction from shared memory to global memory). We then replace these functions
2797  * into the already compiled query_func (replacing two placeholders, write_back_nop and
2798  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
2799  */
2800  if (gpu_smem_context.isSharedMemoryUsed()) {
2801  if (query_mem_desc->getQueryDescriptionType() ==
2803  GpuSharedMemCodeBuilder gpu_smem_code(
2804  cgen_state_->module_,
2805  cgen_state_->context_,
2806  *query_mem_desc,
2808  plan_state_->init_agg_vals_);
2809  gpu_smem_code.codegen();
2810  gpu_smem_code.injectFunctionsInto(query_func);
2811 
2812  // helper functions are used for caching purposes later
2813  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2814  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2815  LOG(IR) << gpu_smem_code.toString();
2816  }
2817  }
2818 
2819  auto multifrag_query_func = cgen_state_->module_->getFunction(
2820  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2821  CHECK(multifrag_query_func);
2822 
2825  multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
2826  }
2827 
2828  bind_query(query_func,
2829  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2830  multifrag_query_func,
2831  cgen_state_->module_);
2832 
2833  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2834  if (cgen_state_->filter_func_) {
2835  root_funcs.push_back(cgen_state_->filter_func_);
2836  }
2837  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2838  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2839 
2840  // Always inline the row function and the filter function.
2841  // We don't want register spills in the inner loops.
2842  // LLVM seems to correctly free up alloca instructions
2843  // in these functions even when they are inlined.
2844  mark_function_always_inline(cgen_state_->row_func_);
2845  if (cgen_state_->filter_func_) {
2846  mark_function_always_inline(cgen_state_->filter_func_);
2847  }
2848 
2849 #ifndef NDEBUG
2850  // Add helpful metadata to the LLVM IR for debugging.
2851  AUTOMATIC_IR_METADATA_DONE();
2852 #endif
2853 
2854  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2855  std::string llvm_ir;
2856  if (eo.just_explain) {
2857  if (co.explain_type == ExecutorExplainType::Optimized) {
2858 #ifdef WITH_JIT_DEBUG
2859  throw std::runtime_error(
2860  "Explain optimized not available when JIT runtime debug symbols are enabled");
2861 #else
2862  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2863  // optimized IR after NVVM reflect
2864  llvm::legacy::PassManager pass_manager;
2865  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2866 #endif // WITH_JIT_DEBUG
2867  }
2868  llvm_ir =
2869  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2870  serialize_llvm_object(cgen_state_->row_func_) +
2871  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2872  : "");
2873 
2874 #ifndef NDEBUG
2875  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2876 #endif
2877  }
2878 
2879  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2880  LOG(IR) << "IR for the "
2881  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2882 #ifdef NDEBUG
2883  LOG(IR) << serialize_llvm_object(query_func)
2884  << serialize_llvm_object(cgen_state_->row_func_)
2885  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2886  : "")
2887  << "\nEnd of IR";
2888 #else
2889  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2890 #endif
2891 
2892  // Run some basic validation checks on the LLVM IR before code is generated below.
2893  verify_function_ir(cgen_state_->row_func_);
2894  if (cgen_state_->filter_func_) {
2895  verify_function_ir(cgen_state_->filter_func_);
2896  }
2897 
2898  // Generate final native code from the LLVM IR.
2899  return std::make_tuple(
2900  CompilationResult{
2901  co.device_type == ExecutorDeviceType::CPU
2902  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2903  : optimizeAndCodegenGPU(query_func,
2904  multifrag_query_func,
2905  live_funcs,
2906  is_group_by || ra_exe_unit.estimator,
2907  cuda_mgr,
2908  co),
2909  cgen_state_->getLiterals(),
2910  output_columnar,
2911  llvm_ir,
2912  std::move(gpu_smem_context)},
2913  std::move(query_mem_desc));
2914 }
std::tuple< llvm::Function *, llvm::CallInst * > query_group_by_template(llvm::Module *module, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:208
std::vector< Analyzer::Expr * > target_exprs
bool is_udf_module_present(bool cpu_only=false)
#define CHECK_EQ(x, y)
Definition: Logger.h:214
std::unique_ptr< llvm::Module > rt_udf_cpu_module
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1079
void codegenJoinLoops(const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
Definition: IRCodegen.cpp:842
std::unique_ptr< llvm::Module > udf_gpu_module
#define LOG(tag)
Definition: Logger.h:200
std::unique_ptr< llvm::Module > rt_udf_gpu_module
void mark_function_always_inline(llvm::Function *func)
llvm::StringRef get_gpu_data_layout()
std::vector< InputDescriptor > input_descs
std::string serialize_llvm_metadata_footnotes(llvm::Function *query_func, CgenState *cgen_state)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
unsigned numBlocksPerMP() const
Definition: Execute.cpp:3388
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
void optimize_ir(llvm::Function *query_func, llvm::Module *module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co)
void addTransientStringLiterals(const RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< RowSetMemoryOwner > &row_set_mem_owner)
Definition: Execute.cpp:1726
std::vector< std::string > get_agg_fnames(const std::vector< Analyzer::Expr * > &target_exprs, const bool is_group_by)
std::string to_string(char const *&&v)
void preloadFragOffsets(const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
Definition: Execute.cpp:3320
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned gpu_blocksize, const unsigned num_blocks_per_mp)
llvm::StringRef get_gpu_target_triple_string()
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
void verify_function_ir(const llvm::Function *func)
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:166
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU(llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
static std::unordered_set< llvm::Function * > markDeadRuntimeFuncs(llvm::Module &module, const std::vector< llvm::Function * > &roots, const std::vector< llvm::Function * > &leaves)
std::unique_ptr< llvm::Module > g_rt_module
ExecutorExplainType explain_type
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU(llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1049
void insertErrorCodeChecker(llvm::Function *query_func, bool hoist_literals, bool allow_runtime_query_interrupt)
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
const std::shared_ptr< Analyzer::Estimator > estimator
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define AUTOMATIC_IR_METADATA_DONE()
ExecutorDeviceType device_type
llvm::BasicBlock * codegenSkipDeletedOuterTableRow(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
std::string serialize_llvm_object(const T *llvm_obj)
static bool alwaysCloneRuntimeFunction(const llvm::Function *func)
std::vector< llvm::Value * > generate_column_heads_load(const int num_columns, llvm::Value *byte_stream_arg, llvm::IRBuilder<> &ir_builder, llvm::LLVMContext &ctx)
std::unique_ptr< llvm::Module > udf_cpu_module
bool g_enable_filter_function
Definition: Execute.cpp:78
void bind_pos_placeholders(const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *module)
void nukeOldState(const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
Definition: Execute.cpp:3301
std::tuple< llvm::Function *, llvm::CallInst * > query_template(llvm::Module *module, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
std::list< std::shared_ptr< Analyzer::Expr > > quals
bool isPotentialInSituRender() const
Definition: RenderInfo.cpp:64
#define CHECK(condition)
Definition: Logger.h:206
#define DEBUG_TIMER(name)
Definition: Logger.h:322
std::vector< llvm::Value * > inlineHoistedLiterals()
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
llvm::Function * create_row_function(const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *module, llvm::LLVMContext &context)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:289
unsigned blockSize() const
Definition: Execute.cpp:3393
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
bool is_rt_udf_module_present(bool cpu_only=false)
#define VLOG(n)
Definition: Logger.h:300
size_t get_shared_memory_size(const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
void bind_query(llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *module)
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
size_t g_gpu_smem_threshold
Definition: Execute.cpp:119
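
At a high level, compileWorkUnit stitches together a query stub that iterates over the fragment's rows and a generated per-row function, with column buffers, join hash table pointers and hoisted literals passed as extra arguments. A minimal CPU-side sketch of that structure, using hypothetical plain-C++ callables in place of the generated LLVM functions:

#include <cstdint>
#include <vector>

// Hypothetical row-function type standing in for the generated row_func_:
// takes the current row index, the column buffers and the literal buffer,
// and returns an error code (0 on success).
using RowFunc = int32_t (*)(int64_t row_idx,
                            const std::vector<const int8_t*>& col_buffers,
                            const int8_t* literal_buffer);

// Sketch of what the generated query stub does on CPU: walk the fragment rows
// contiguously (pos_start = 0, pos_step = 1) and stop on the first non-zero
// error code, which the caller then inspects.
int32_t run_query_stub(RowFunc row_func,
                       int64_t row_count,
                       const std::vector<const int8_t*>& col_buffers,
                       const int8_t* literal_buffer) {
  for (int64_t pos = 0; pos < row_count; ++pos) {
    const int32_t err = row_func(pos, col_buffers, literal_buffer);
    if (err != 0) {
      return err;  // surfaced through the error_code argument in the real code
    }
  }
  return 0;
}

On GPU the same stub is bound to strided position functions instead (pos_start/pos_step per thread), which is what the bind_pos_placeholders calls in the listing select.
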
AggregatedColRange Executor::computeColRangesCache ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3796 of file Execute.cpp.

References catalog_(), CHECK, getLeafColumnRange(), AggregatedColRange::setColRange(), and ExpressionRange::typeSupportsRange().

3797  {
3798  AggregatedColRange agg_col_range_cache;
3799  CHECK(catalog_);
3800  std::unordered_set<int> phys_table_ids;
3801  for (const auto& phys_input : phys_inputs) {
3802  phys_table_ids.insert(phys_input.table_id);
3803  }
3804  std::vector<InputTableInfo> query_infos;
3805  for (const int table_id : phys_table_ids) {
3806  query_infos.emplace_back(InputTableInfo{table_id, getTableInfo(table_id)});
3807  }
3808  for (const auto& phys_input : phys_inputs) {
3809  const auto cd =
3810  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3811  CHECK(cd);
3812  if (ExpressionRange::typeSupportsRange(cd->columnType)) {
3813  const auto col_var = boost::make_unique<Analyzer::ColumnVar>(
3814  cd->columnType, phys_input.table_id, phys_input.col_id, 0);
3815  const auto col_range = getLeafColumnRange(col_var.get(), query_infos, this, false);
3816  agg_col_range_cache.setColRange(phys_input, col_range);
3817  }
3818  }
3819  return agg_col_range_cache;
3820 }
Fragmenter_Namespace::TableInfo getTableInfo(const int table_id) const
Definition: Execute.cpp:301
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1078
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
ExpressionRange getLeafColumnRange(const Analyzer::ColumnVar *col_expr, const std::vector< InputTableInfo > &query_infos, const Executor *executor, const bool is_outer_join_proj)
#define CHECK(condition)
Definition: Logger.h:206
void setColRange(const PhysicalInput &, const ExpressionRange &)
static bool typeSupportsRange(const SQLTypeInfo &ti)

+ Here is the call graph for this function:
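
Conceptually, the aggregated column range cache maps each physical input (table id, column id) to the merged min/max observed across its fragments. A minimal sketch with hypothetical simplified types (the real code uses PhysicalInput, ExpressionRange and AggregatedColRange):

#include <algorithm>
#include <cstdint>
#include <map>
#include <utility>

// Hypothetical stand-ins for PhysicalInput / ExpressionRange.
using ColumnKey = std::pair<int, int>;  // {table_id, col_id}
struct IntRange {
  int64_t min;
  int64_t max;
};

// Merge a per-fragment range into the cache, widening any existing entry.
void set_col_range(std::map<ColumnKey, IntRange>& cache,
                   const ColumnKey& key,
                   const IntRange& range) {
  auto it = cache.find(key);
  if (it == cache.end()) {
    cache.emplace(key, range);
  } else {
    it->second.min = std::min(it->second.min, range.min);
    it->second.max = std::max(it->second.max, range.max);
  }
}
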

StringDictionaryGenerations Executor::computeStringDictionaryGenerations ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3822 of file Execute.cpp.

References catalog_(), CHECK, kENCODING_DICT, and StringDictionaryGenerations::setGeneration().

3823  {
3824  StringDictionaryGenerations string_dictionary_generations;
3825  CHECK(catalog_);
3826  for (const auto& phys_input : phys_inputs) {
3827  const auto cd =
3828  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3829  CHECK(cd);
3830  const auto& col_ti =
3831  cd->columnType.is_array() ? cd->columnType.get_elem_type() : cd->columnType;
3832  if (col_ti.is_string() && col_ti.get_compression() == kENCODING_DICT) {
3833  const int dict_id = col_ti.get_comp_param();
3834  const auto dd = catalog_->getMetadataForDict(dict_id);
3835  CHECK(dd && dd->stringDict);
3836  string_dictionary_generations.setGeneration(dict_id,
3837  dd->stringDict->storageEntryCount());
3838  }
3839  }
3840  return string_dictionary_generations;
3841 }
void setGeneration(const uint32_t id, const uint64_t generation)
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1078
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
const DictDescriptor * getMetadataForDict(int dict_ref, bool loadDict=true) const
Definition: Catalog.cpp:1522
#define CHECK(condition)
Definition: Logger.h:206

+ Here is the call graph for this function:
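
A string dictionary generation is the dictionary's entry count captured when the query starts; later lookups are clamped to that count so strings added by concurrent inserts do not leak into the result. A minimal sketch under those assumptions (hypothetical names, not the StringDictionaryGenerations API):

#include <cstdint>
#include <unordered_map>

// Hypothetical generation map: dictionary id -> entry count at query start.
class DictionaryGenerationsSketch {
 public:
  void setGeneration(int dict_id, uint64_t entry_count) {
    generations_[dict_id] = entry_count;
  }
  // Returns true if a dictionary-encoded string id was already present when
  // the generation was captured (ids are assumed to be assigned densely from zero).
  bool isVisible(int dict_id, uint64_t string_id) const {
    const auto it = generations_.find(dict_id);
    return it != generations_.end() && string_id < it->second;
  }

 private:
  std::unordered_map<int, uint64_t> generations_;
};
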

TableGenerations Executor::computeTableGenerations ( std::unordered_set< int >  phys_table_ids)
private

Definition at line 3843 of file Execute.cpp.

References TableGenerations::setGeneration().

3844  {
3845  TableGenerations table_generations;
3846  for (const int table_id : phys_table_ids) {
3847  const auto table_info = getTableInfo(table_id);
3848  table_generations.setGeneration(
3849  table_id,
3850  TableGeneration{static_cast<int64_t>(table_info.getPhysicalNumTuples()), 0});
3851  }
3852  return table_generations;
3853 }
Fragmenter_Namespace::TableInfo getTableInfo(const int table_id) const
Definition: Execute.cpp:301
void setGeneration(const uint32_t id, const TableGeneration &generation)

+ Here is the call graph for this function:

bool Executor::containsLeftDeepOuterJoin ( ) const
inline

Definition at line 419 of file Execute.h.

References cgen_state_.

419  {
420  return cgen_state_->contains_left_deep_outer_join_;
421  }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
void Executor::createErrorCheckControlFlow ( llvm::Function *  query_func,
bool  run_with_dynamic_watchdog,
bool  run_with_allowing_runtime_interrupt,
ExecutorDeviceType  device_type,
const std::vector< InputTableInfo > &  input_table_infos 
)
private

Definition at line 1880 of file NativeCodegen.cpp.

1885  {
1886  AUTOMATIC_IR_METADATA(cgen_state_.get());
1887 
1888  // check whether the row processing was successful; currently, it can
1889  // fail by running out of group by buffer slots
1890 
1891  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1892  // when both the dynamic watchdog and the runtime interrupt are enabled,
1893  // we use the dynamic watchdog
1894  run_with_allowing_runtime_interrupt = false;
1895  }
1896 
1897  {
1898  // disable injecting query interrupt checker if the session info is invalid
1899  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1900  if (current_query_session_.empty()) {
1901  run_with_allowing_runtime_interrupt = false;
1902  }
1903  }
1904 
1905  llvm::Value* row_count = nullptr;
1906  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1907  device_type == ExecutorDeviceType::GPU) {
1908  row_count =
1909  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1910  }
1911 
1912  bool done_splitting = false;
1913  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1914  ++bb_it) {
1915  llvm::Value* pos = nullptr;
1916  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1917  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1918  llvm::isa<llvm::PHINode>(*inst_it)) {
1919  if (inst_it->getName() == "pos") {
1920  pos = &*inst_it;
1921  }
1922  continue;
1923  }
1924  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1925  continue;
1926  }
1927  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1928  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1929  auto next_inst_it = inst_it;
1930  ++next_inst_it;
1931  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1932  auto& br_instr = bb_it->back();
1933  llvm::IRBuilder<> ir_builder(&br_instr);
1934  llvm::Value* err_lv = &*inst_it;
1935  llvm::Value* err_lv_returned_from_row_func = nullptr;
1936  if (run_with_dynamic_watchdog) {
1937  CHECK(pos);
1938  llvm::Value* call_watchdog_lv = nullptr;
1939  if (device_type == ExecutorDeviceType::GPU) {
1940  // In order to make sure all threads within a block see the same barrier,
1941  // only those blocks whose none of their threads have experienced the critical
1942  // edge will go through the dynamic watchdog computation
1943  CHECK(row_count);
1944  auto crit_edge_rem =
1945  (blockSize() & (blockSize() - 1))
1946  ? ir_builder.CreateSRem(
1947  row_count,
1948  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1949  : ir_builder.CreateAnd(
1950  row_count,
1951  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1952  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1953  crit_edge_threshold->setName("crit_edge_threshold");
1954 
1955  // only those threads where pos < crit_edge_threshold go through dynamic
1956  // watchdog call
1957  call_watchdog_lv =
1958  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1959  } else {
1960  // CPU path: run watchdog for every 64th row
1961  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1962  call_watchdog_lv = ir_builder.CreateICmp(
1963  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1964  }
1965  CHECK(call_watchdog_lv);
1966  auto error_check_bb = bb_it->splitBasicBlock(
1967  llvm::BasicBlock::iterator(br_instr), ".error_check");
1968  auto& watchdog_br_instr = bb_it->back();
1969 
1970  auto watchdog_check_bb = llvm::BasicBlock::Create(
1971  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1972  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1973  auto detected_timeout = watchdog_ir_builder.CreateCall(
1974  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1975  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1976  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1977  watchdog_ir_builder.CreateBr(error_check_bb);
1978 
1979  llvm::ReplaceInstWithInst(
1980  &watchdog_br_instr,
1981  llvm::BranchInst::Create(
1982  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1983  ir_builder.SetInsertPoint(&br_instr);
1984  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1985 
1986  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1987  unified_err_lv->addIncoming(err_lv, &*bb_it);
1988  err_lv = unified_err_lv;
1989  } else if (run_with_allowing_runtime_interrupt) {
1990  CHECK(pos);
1991  llvm::Value* call_check_interrupt_lv = nullptr;
1992  if (device_type == ExecutorDeviceType::GPU) {
1993  // approximate how many times the %pos variable
1994  // is increased --> the number of iteration
1995  // here we calculate the # bit shift by considering grid/block/fragment sizes
1996  // since if we use the fixed one (i.e., per 64-th increment)
1997  // some CUDA threads cannot enter the interrupt checking block depending on
1998  // the fragment size --> a thread may not take care of 64 threads if an outer
1999  // table is not sufficiently large, and so cannot be interrupted
2000  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
2001  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
2002  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
2003  uint64_t interrupt_checking_freq = 32;
2004  auto freq_control_knob = g_running_query_interrupt_freq;
2005  CHECK_GT(freq_control_knob, 0);
2006  CHECK_LE(freq_control_knob, 1.0);
2007  if (!input_table_infos.empty()) {
2008  const auto& outer_table_info = *input_table_infos.begin();
2009  auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
2010  if (outer_table_info.table_id < 0) {
2011  auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
2012  CHECK(rs);
2013  num_outer_table_tuples = rs->entryCount();
2014  } else {
2015  auto num_frags = outer_table_info.info.fragments.size();
2016  if (num_frags > 0) {
2017  num_outer_table_tuples =
2018  outer_table_info.info.fragments.begin()->getNumTuples();
2019  }
2020  }
2021  if (num_outer_table_tuples > 0) {
2022  // gridSize * blockSize --> pos_step (idx of the next row per thread)
2023  // we additionally multiply two to pos_step since the number of
2024  // dispatched blocks are double of the gridSize
2025  // # tuples (of fragment) / pos_step --> maximum # increment (K)
2026  // also we multiply 1 / freq_control_knob to K to control the frequency
2027  // So, needs to check the interrupt status more frequently? make K smaller
2028  auto max_inc = uint64_t(
2029  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
2030  if (max_inc < 2) {
2031  // too small `max_inc`, so this correction is necessary to make
2032  // `interrupt_checking_freq` be valid (i.e., larger than zero)
2033  max_inc = 2;
2034  }
2035  auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
2036  interrupt_checking_freq =
2037  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
2038  // add the coverage when interrupt_checking_freq > K
2039  // if so, some threads still cannot be branched to the interrupt checker
2040  // so we manually use smaller but close to the max_inc as freq
2041  if (interrupt_checking_freq > max_inc) {
2042  interrupt_checking_freq = max_inc / 2;
2043  }
2044  if (interrupt_checking_freq < 8) {
2045  // such small freq incurs too frequent interrupt status checking,
2046  // so we fixup to the minimum freq value at some reasonable degree
2047  interrupt_checking_freq = 8;
2048  }
2049  }
2050  }
2051  VLOG(1) << "Set the running query interrupt checking frequency: "
2052  << interrupt_checking_freq;
2053  // check the interrupt flag for every interrupt_checking_freq-th iteration
2054  llvm::Value* pos_shifted_per_iteration =
2055  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
2056  auto interrupt_predicate =
2057  ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
2058  call_check_interrupt_lv =
2059  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2060  interrupt_predicate,
2061  cgen_state_->llInt(int64_t(0LL)));
2062  } else {
2063  // CPU path: run interrupt checker for every 64th row
2064  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
2065  call_check_interrupt_lv =
2066  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2067  interrupt_predicate,
2068  cgen_state_->llInt(int64_t(0LL)));
2069  }
2070  CHECK(call_check_interrupt_lv);
2071  auto error_check_bb = bb_it->splitBasicBlock(
2072  llvm::BasicBlock::iterator(br_instr), ".error_check");
2073  auto& check_interrupt_br_instr = bb_it->back();
2074 
2075  auto interrupt_check_bb = llvm::BasicBlock::Create(
2076  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2077  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2078  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2079  cgen_state_->module_->getFunction("check_interrupt"), {});
2080  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
2081  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
2082  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2083 
2084  llvm::ReplaceInstWithInst(
2085  &check_interrupt_br_instr,
2086  llvm::BranchInst::Create(
2087  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
2088  ir_builder.SetInsertPoint(&br_instr);
2089  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2090 
2091  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
2092  unified_err_lv->addIncoming(err_lv, &*bb_it);
2093  err_lv = unified_err_lv;
2094  }
2095  if (!err_lv_returned_from_row_func) {
2096  err_lv_returned_from_row_func = err_lv;
2097  }
2098  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
2099  // let kernel execution finish as expected, regardless of the observed error,
2100  // unless it is from the dynamic watchdog where all threads within that block
2101  // return together.
2102  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2103  err_lv,
2104  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
2105  } else {
2106  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
2107  err_lv,
2108  cgen_state_->llInt(static_cast<int32_t>(0)));
2109  }
2110  auto error_bb = llvm::BasicBlock::Create(
2111  cgen_state_->context_, ".error_exit", query_func, new_bb);
2112  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
2113  llvm::CallInst::Create(
2114  cgen_state_->module_->getFunction("record_error_code"),
2115  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
2116  "",
2117  error_bb);
2118  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2119  llvm::ReplaceInstWithInst(&br_instr,
2120  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2121  done_splitting = true;
2122  break;
2123  }
2124  }
2125  }
2126  CHECK(done_splitting);
2127 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1094
double g_running_query_interrupt_freq
Definition: Execute.cpp:118
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1142
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1034
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:76
#define CHECK_GT(x, y)
Definition: Logger.h:218
unsigned getExpOfTwo(unsigned n)
Definition: MathUtils.cpp:23
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:166
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1141
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static QuerySessionId current_query_session_
Definition: Execute.h:1096
#define CHECK_LE(x, y)
Definition: Logger.h:217
unsigned gridSize() const
Definition: Execute.cpp:3379
#define CHECK(condition)
Definition: Logger.h:206
unsigned blockSize() const
Definition: Execute.cpp:3393
#define VLOG(n)
Definition: Logger.h:300
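
The GPU branch above derives how often each thread checks the interrupt flag from the outer table size, the launch dimensions and the g_running_query_interrupt_freq knob. The arithmetic can be reproduced in isolation; a minimal sketch, assuming the same clamping rules as the listing and that shared::getExpOfTwo behaves as floor(log2(n)):

#include <cmath>
#include <cstdint>

// Assumed behavior of shared::getExpOfTwo: floor(log2(n)) for n >= 1, 0 for n == 0.
static unsigned exp_of_two(uint64_t n) {
  unsigned exp = 0;
  while (n >>= 1) {
    ++exp;
  }
  return exp;
}

// Reproduces the calibration in the GPU branch above. freq_control_knob is
// g_running_query_interrupt_freq (0 < knob <= 1); grid/block are launch dims.
uint64_t interrupt_checking_freq(uint64_t num_outer_table_tuples,
                                 unsigned grid_size,
                                 unsigned block_size,
                                 double freq_control_knob) {
  uint64_t freq = 32;  // default used when the outer table is empty
  if (num_outer_table_tuples > 0) {
    auto max_inc = static_cast<uint64_t>(
        std::floor(num_outer_table_tuples / (grid_size * block_size * 2.0)));
    if (max_inc < 2) {
      max_inc = 2;  // keep the frequency valid (larger than zero)
    }
    const auto calibrated_inc =
        static_cast<uint64_t>(std::floor(max_inc * (1 - freq_control_knob)));
    freq = uint64_t(1) << exp_of_two(calibrated_inc);  // round down to a power of two
    if (freq > max_inc) {
      freq = max_inc / 2;  // make sure every thread can reach the checker
    }
    if (freq < 8) {
      freq = 8;  // avoid overly frequent status checks
    }
  }
  return freq;
}
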
std::vector< std::unique_ptr< ExecutionKernel > > Executor::createKernels ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
ColumnFetcher column_fetcher,
const std::vector< InputTableInfo > &  table_infos,
const ExecutionOptions eo,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const size_t  context_count,
const QueryCompilationDescriptor query_comp_desc,
const QueryMemoryDescriptor query_mem_desc,
RenderInfo render_info,
std::unordered_set< int > &  available_gpus,
int &  available_cpus 
)
private

Determines execution dispatch mode and required fragments for a given query step, then creates kernels to execute the query and returns them for launch.

Definition at line 2098 of file Execute.cpp.

References ExecutionOptions::allow_multifrag, catalog_(), CHECK, CHECK_GE, CHECK_GT, anonymous_namespace{Execute.cpp}::checkWorkUnitWatchdog(), data_mgr_(), g_inner_join_fragment_skipping, QueryCompilationDescriptor::getDeviceType(), QueryMemoryDescriptor::getEntryCount(), SharedKernelContext::getFragOffsets(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, ExecutionOptions::gpu_input_mem_limit_percent, Data_Namespace::GPU_LEVEL, anonymous_namespace{Execute.cpp}::has_lazy_fetched_columns(), logger::INFO, RelAlgExecutionUnit::input_descs, KernelPerFragment, LOG, MultifragmentKernel, ExecutionOptions::outer_fragment_indices, Projection, query_mem_desc, RelAlgExecutionUnit::target_exprs, QueryMemoryDescriptor::toString(), RelAlgExecutionUnit::use_bump_allocator, VLOG, and ExecutionOptions::with_watchdog.

2111  {
2112  std::vector<std::unique_ptr<ExecutionKernel>> execution_kernels;
2113 
2114  QueryFragmentDescriptor fragment_descriptor(
2115  ra_exe_unit,
2116  table_infos,
2117  query_comp_desc.getDeviceType() == ExecutorDeviceType::GPU
2118  ? data_mgr_->getMemoryInfo(Data_Namespace::MemoryLevel::GPU_LEVEL)
2119  : std::vector<Data_Namespace::MemoryInfo>{},
2120  eo.gpu_input_mem_limit_percent,
2121  eo.outer_fragment_indices);
2122  CHECK(!ra_exe_unit.input_descs.empty());
2123 
2124  const auto device_type = query_comp_desc.getDeviceType();
2125  const bool uses_lazy_fetch =
2126  plan_state_->allow_lazy_fetch_ &&
2127  has_lazy_fetched_columns(getColLazyFetchInfo(ra_exe_unit.target_exprs));
2128  const bool use_multifrag_kernel = (device_type == ExecutorDeviceType::GPU) &&
2129  eo.allow_multifrag && (!uses_lazy_fetch || is_agg);
2130  const auto device_count = deviceCount(device_type);
2131  CHECK_GT(device_count, 0);
2132 
2133  fragment_descriptor.buildFragmentKernelMap(ra_exe_unit,
2134  shared_context.getFragOffsets(),
2135  device_count,
2136  device_type,
2137  use_multifrag_kernel,
2138  g_inner_join_fragment_skipping,
2139  this);
2140  if (eo.with_watchdog && fragment_descriptor.shouldCheckWorkUnitWatchdog()) {
2141  checkWorkUnitWatchdog(ra_exe_unit, table_infos, *catalog_, device_type, device_count);
2142  }
2143 
2144  if (use_multifrag_kernel) {
2145  VLOG(1) << "Creating multifrag execution kernels";
2146  VLOG(1) << query_mem_desc.toString();
2147 
2148  // NB: We should never be on this path when the query is retried because of running
2149  // out of group by slots; also, for scan only queries on CPU we want the
2150  // high-granularity, fragment by fragment execution instead. For scan only queries on
2151  // GPU, we want the multifrag kernel path to save the overhead of allocating an output
2152  // buffer per fragment.
2153  auto multifrag_kernel_dispatch = [&ra_exe_unit,
2154  &execution_kernels,
2155  &column_fetcher,
2156  &eo,
2157  &query_comp_desc,
2158  &query_mem_desc,
2159  render_info](const int device_id,
2160  const FragmentsList& frag_list,
2161  const int64_t rowid_lookup_key) {
2162  execution_kernels.emplace_back(
2163  std::make_unique<ExecutionKernel>(ra_exe_unit,
2164  ExecutorDeviceType::GPU,
2165  device_id,
2166  eo,
2167  column_fetcher,
2168  query_comp_desc,
2169  query_mem_desc,
2170  frag_list,
2171  ExecutorDispatchMode::MultifragmentKernel,
2172  render_info,
2173  rowid_lookup_key));
2174  };
2175  fragment_descriptor.assignFragsToMultiDispatch(multifrag_kernel_dispatch);
2176  } else {
2177  VLOG(1) << "Creating one execution kernel per fragment";
2178  VLOG(1) << query_mem_desc.toString();
2179 
2180  if (!ra_exe_unit.use_bump_allocator && allow_single_frag_table_opt &&
2181  (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) &&
2182  table_infos.size() == 1 && table_infos.front().table_id > 0) {
2183  const auto max_frag_size =
2184  table_infos.front().info.getFragmentNumTuplesUpperBound();
2185  if (max_frag_size < query_mem_desc.getEntryCount()) {
2186  LOG(INFO) << "Lowering scan limit from " << query_mem_desc.getEntryCount()
2187  << " to match max fragment size " << max_frag_size
2188  << " for kernel per fragment execution path.";
2189  throw CompilationRetryNewScanLimit(max_frag_size);
2190  }
2191  }
2192 
2193  size_t frag_list_idx{0};
2194  auto fragment_per_kernel_dispatch = [&ra_exe_unit,
2195  &execution_kernels,
2196  &column_fetcher,
2197  &eo,
2198  &frag_list_idx,
2199  &device_type,
2200  &query_comp_desc,
2201  &query_mem_desc,
2202  render_info](const int device_id,
2203  const FragmentsList& frag_list,
2204  const int64_t rowid_lookup_key) {
2205  if (!frag_list.size()) {
2206  return;
2207  }
2208  CHECK_GE(device_id, 0);
2209 
2210  execution_kernels.emplace_back(
2211  std::make_unique<ExecutionKernel>(ra_exe_unit,
2212  device_type,
2213  device_id,
2214  eo,
2215  column_fetcher,
2216  query_comp_desc,
2217  query_mem_desc,
2218  frag_list,
2219  ExecutorDispatchMode::KernelPerFragment,
2220  render_info,
2221  rowid_lookup_key));
2222  ++frag_list_idx;
2223  };
2224 
2225  fragment_descriptor.assignFragsToKernelDispatch(fragment_per_kernel_dispatch,
2226  ra_exe_unit);
2227  }
2228 
2229  return execution_kernels;
2230 }
bool is_agg(const Analyzer::Expr *expr)
std::vector< Analyzer::Expr * > target_exprs
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1079
ExecutorDeviceType getDeviceType() const
const std::vector< uint64_t > & getFragOffsets()
std::string toString() const
#define LOG(tag)
Definition: Logger.h:200
std::vector< size_t > outer_fragment_indices
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo(const std::vector< Analyzer::Expr * > &target_exprs) const
Definition: Execute.cpp:343
std::vector< InputDescriptor > input_descs
#define CHECK_GE(x, y)
Definition: Logger.h:219
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:652
#define CHECK_GT(x, y)
Definition: Logger.h:218
std::vector< FragmentsPerTable > FragmentsList
bool g_inner_join_fragment_skipping
Definition: Execute.cpp:84
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1078
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1049
void checkWorkUnitWatchdog(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const Catalog_Namespace::Catalog &cat, const ExecutorDeviceType device_type, const int device_count)
Definition: Execute.cpp:1115
std::vector< MemoryInfo > getMemoryInfo(const MemoryLevel memLevel)
Definition: DataMgr.cpp:303
#define CHECK(condition)
Definition: Logger.h:206
double gpu_input_mem_limit_percent
bool has_lazy_fetched_columns(const std::vector< ColumnLazyFetchInfo > &fetched_cols)
Definition: Execute.cpp:2087
#define VLOG(n)
Definition: Logger.h:300

+ Here is the call graph for this function:
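
The choice between one multi-fragment kernel per device and one kernel per fragment boils down to a few booleans in the listing above. A minimal sketch of that decision as a hypothetical free function, with the inputs passed explicitly:

// Mirrors the use_multifrag_kernel condition in createKernels: multi-fragment
// kernels are only used on GPU, when the execution options allow them, and
// when lazy fetch is not in play (or the query is an aggregate, where lazily
// fetched columns are not carried into the per-fragment output buffers).
bool use_multifrag_kernel(bool is_gpu,
                          bool allow_multifrag,
                          bool uses_lazy_fetch,
                          bool is_agg) {
  return is_gpu && allow_multifrag && (!uses_lazy_fetch || is_agg);
}
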

CudaMgr_Namespace::CudaMgr* Executor::cudaMgr ( ) const
inlineprivate

Definition at line 530 of file Execute.h.

References CHECK, data_mgr_, and Data_Namespace::DataMgr::getCudaMgr().

Referenced by isArchPascalOrLater().

530  {
531  CHECK(data_mgr_);
532  auto cuda_mgr = data_mgr_->getCudaMgr();
533  CHECK(cuda_mgr);
534  return cuda_mgr;
535  }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:208
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1079
#define CHECK(condition)
Definition: Logger.h:206

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int Executor::deviceCount ( const ExecutorDeviceType  device_type) const
private

Definition at line 652 of file Execute.cpp.

References GPU.

652  {
653  if (device_type == ExecutorDeviceType::GPU) {
654  return cudaMgr()->getDeviceCount();
655  } else {
656  return 1;
657  }
658 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:530
int getDeviceCount() const
Definition: CudaMgr.h:86
int Executor::deviceCountForMemoryLevel ( const Data_Namespace::MemoryLevel  memory_level) const
private

Definition at line 660 of file Execute.cpp.

References CPU, GPU, and Data_Namespace::GPU_LEVEL.

661  {
662  return memory_level == GPU_LEVEL ? deviceCount(ExecutorDeviceType::GPU)
663  : deviceCount(ExecutorDeviceType::CPU);
664 }
ExecutorDeviceType
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:652
int64_t Executor::deviceCycles ( int  milliseconds) const
private

Definition at line 3407 of file Execute.cpp.

3407  {
3408  const auto& dev_props = cudaMgr()->getAllDeviceProperties();
3409  return static_cast<int64_t>(dev_props.front().clockKhz) * milliseconds;
3410 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:530
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:120
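
deviceCycles converts a wall-clock budget into device clock cycles using the clock rate reported in kHz, so cycles = clockKhz * milliseconds (the factors of 1000 cancel). A minimal sketch of the same arithmetic as a hypothetical helper:

#include <cstdint>

// clock_khz is DeviceProperties::clockKhz; kHz * ms = cycles because
// (1000 cycles per second) * (1/1000 second) cancels out.
int64_t device_cycles(int clock_khz, int milliseconds) {
  return static_cast<int64_t>(clock_khz) * milliseconds;
}
// Example: a 1'410'000 kHz (1.41 GHz) device over 100 ms -> 141'000'000 cycles.
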
void Executor::enableRuntimeQueryInterrupt ( const double  runtime_query_check_freq,
const unsigned  pending_query_check_freq 
) const

Definition at line 4218 of file Execute.cpp.

References g_enable_runtime_query_interrupt, g_pending_query_interrupt_freq, and g_running_query_interrupt_freq.

4220  {
4221  // The only scenario in which we intentionally call this function is
4222  // to allow runtime query interrupt in QueryRunner for test cases.
4223  // Because the test machine's default settings do not allow runtime query
4224  // interrupt, we have to turn it on within test code when necessary.
4226  g_pending_query_interrupt_freq = pending_query_check_freq;
4227  g_running_query_interrupt_freq = runtime_query_check_freq;
4230  }
4231 }
double g_running_query_interrupt_freq
Definition: Execute.cpp:118
unsigned g_pending_query_interrupt_freq
Definition: Execute.cpp:117
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:114
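
As the comment notes, this entry point exists mainly so tests can force-enable runtime interrupts. A hypothetical usage sketch (the executor pointer and the chosen frequency values are illustrative only; the running-query knob must lie in (0, 1]):

// Force-enable runtime interrupts in a test harness (illustrative values).
executor->enableRuntimeQueryInterrupt(/*runtime_query_check_freq=*/0.5,
                                      /*pending_query_check_freq=*/1000);
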
void Executor::enrollQuerySession ( const QuerySessionId query_session,
const std::string &  query_str,
const std::string &  submitted_time_str,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_session_status 
)

Definition at line 4025 of file Execute.cpp.

References executor_id_().

4030  {
4031  // enroll the query session into the Executor's session map
4032  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
4033  if (query_session.empty()) {
4034  return;
4035  }
4036 
4037  addToQuerySessionList(query_session,
4038  query_str,
4039  submitted_time_str,
4040  executor_id,
4041  query_session_status,
4042  session_write_lock);
4043 
4044  if (query_session_status == QuerySessionStatus::QueryStatus::RUNNING) {
4045  current_query_session_ = query_session;
4046  running_query_executor_id_ = executor_id;
4047  }
4048 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1094
const ExecutorId executor_id_
Definition: Execute.h:1077
bool addToQuerySessionList(const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:4050
static QuerySessionId current_query_session_
Definition: Execute.h:1096
static size_t running_query_executor_id_
Definition: Execute.h:1098

+ Here is the call graph for this function:
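
Session bookkeeping follows a reader/writer locking pattern: mutations such as enrollment take a unique lock on the session mutex, while status checks elsewhere take shared locks. A minimal sketch of that pattern with standard library primitives (hypothetical, simplified types; the real code uses mapd_shared_mutex and a richer per-session status map):

#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>

class QuerySessionRegistrySketch {
 public:
  // Writer path, analogous to enrollQuerySession / addToQuerySessionList.
  void enroll(const std::string& session, const std::string& query_str) {
    std::unique_lock<std::shared_mutex> write_lock(mutex_);
    if (session.empty()) {
      return;  // empty sessions are not tracked
    }
    sessions_[session] = query_str;
  }

  // Reader path, analogous to the shared-lock status checks.
  bool isEnrolled(const std::string& session) const {
    std::shared_lock<std::shared_mutex> read_lock(mutex_);
    return sessions_.count(session) > 0;
  }

 private:
  mutable std::shared_mutex mutex_;
  std::unordered_map<std::string, std::string> sessions_;
};
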

ResultSetPtr Executor::executeExplain ( const QueryCompilationDescriptor query_comp_desc)
private

Definition at line 1722 of file Execute.cpp.

References QueryCompilationDescriptor::getIR().

1722  {
1723  return std::make_shared<ResultSet>(query_comp_desc.getIR());
1724 }

+ Here is the call graph for this function:

int32_t Executor::executePlanWithGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< size_t >  outer_tab_frag_ids,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const int  outer_table_id,
const int64_t  limit,
const uint32_t  start_rowid,
const uint32_t  num_tables,
const bool  allow_runtime_interrupt,
RenderInfo render_info 
)
private

Definition at line 3103 of file Execute.cpp.

References CHECK, CHECK_NE, anonymous_namespace{Execute.cpp}::check_rows_less_than_needed(), CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, logger::FATAL, g_enable_dynamic_watchdog, CompilationResult::generated_code, QueryMemoryDescriptor::getEntryCount(), QueryExecutionContext::getRowSet(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, RelAlgExecutionUnit::groupby_exprs, INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, shared::printContainer(), QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, RenderInfo::render_allocator_map_ptr, RelAlgExecutionUnit::scan_limit, QueryMemoryDescriptor::setEntryCount(), RelAlgExecutionUnit::union_all, RenderInfo::useCudaBuffers(), and VLOG.

3121  {
3122  auto timer = DEBUG_TIMER(__func__);
3123  INJECT_TIMER(executePlanWithGroupBy);
3124  CHECK(!results);
3125  if (col_buffers.empty()) {
3126  return 0;
3127  }
3128  CHECK_NE(ra_exe_unit.groupby_exprs.size(), size_t(0));
3129  // TODO(alex):
3130  // 1. Optimize size (make keys more compact).
3131  // 2. Resize on overflow.
3132  // 3. Optimize runtime.
3133  auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
3134  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
3135  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
3136  if (allow_runtime_interrupt) {
3137  bool isInterrupted = false;
3138  {
3139  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
3140  const auto query_session = getCurrentQuerySession(session_read_lock);
3141  isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock);
3142  }
3143  if (isInterrupted) {
3144  throw QueryExecutionError(ERR_INTERRUPTED);
3145  }
3146  }
3147  if (g_enable_dynamic_watchdog && interrupted_.load()) {
3148  return ERR_INTERRUPTED;
3149  }
3150 
3151  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
3152  if (render_info && render_info->useCudaBuffers()) {
3153  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
3154  }
3155 
3156  VLOG(2) << "bool(ra_exe_unit.union_all)=" << bool(ra_exe_unit.union_all)
3157  << " ra_exe_unit.input_descs="
3158  << shared::printContainer(ra_exe_unit.input_descs)
3159  << " ra_exe_unit.input_col_descs="
3160  << shared::printContainer(ra_exe_unit.input_col_descs)
3161  << " ra_exe_unit.scan_limit=" << ra_exe_unit.scan_limit
3162  << " num_rows=" << shared::printContainer(num_rows)
3163  << " frag_offsets=" << shared::printContainer(frag_offsets)
3164  << " query_exe_context->query_buffers_->num_rows_="
3165  << query_exe_context->query_buffers_->num_rows_
3166  << " query_exe_context->query_mem_desc_.getEntryCount()="
3167  << query_exe_context->query_mem_desc_.getEntryCount()
3168  << " device_id=" << device_id << " outer_table_id=" << outer_table_id
3169  << " scan_limit=" << scan_limit << " start_rowid=" << start_rowid
3170  << " num_tables=" << num_tables;
3171 
3172  RelAlgExecutionUnit ra_exe_unit_copy = ra_exe_unit;
3173  // For UNION ALL, filter out input_descs and input_col_descs that are not associated
3174  // with outer_table_id.
3175  if (ra_exe_unit_copy.union_all) {
3176  // Sort outer_table_id first, then pop the rest off of ra_exe_unit_copy.input_descs.
3177  std::stable_sort(ra_exe_unit_copy.input_descs.begin(),
3178  ra_exe_unit_copy.input_descs.end(),
3179  [outer_table_id](auto const& a, auto const& b) {
3180  return a.getTableId() == outer_table_id &&
3181  b.getTableId() != outer_table_id;
3182  });
3183  while (!ra_exe_unit_copy.input_descs.empty() &&
3184  ra_exe_unit_copy.input_descs.back().getTableId() != outer_table_id) {
3185  ra_exe_unit_copy.input_descs.pop_back();
3186  }
3187  // Filter ra_exe_unit_copy.input_col_descs.
3188  ra_exe_unit_copy.input_col_descs.remove_if(
3189  [outer_table_id](auto const& input_col_desc) {
3190  return input_col_desc->getScanDesc().getTableId() != outer_table_id;
3191  });
3192  query_exe_context->query_mem_desc_.setEntryCount(ra_exe_unit_copy.scan_limit);
3193  }
3194 
3195  if (device_type == ExecutorDeviceType::CPU) {
3196  const int32_t scan_limit_for_query =
3197  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
3198  const int32_t max_matched = scan_limit_for_query == 0
3199  ? query_exe_context->query_mem_desc_.getEntryCount()
3200  : scan_limit_for_query;
3201 
3202  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
3203  compilation_result.generated_code);
3204  CHECK(cpu_generated_code);
3205  query_exe_context->launchCpuCode(ra_exe_unit_copy,
3206  cpu_generated_code.get(),
3207  hoist_literals,
3208  hoist_buf,
3209  col_buffers,
3210  num_rows,
3211  frag_offsets,
3212  max_matched,
3213  &error_code,
3214  num_tables,
3215  join_hash_table_ptrs);
3216  } else {
3217  try {
3218  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
3219  compilation_result.generated_code);
3220  CHECK(gpu_generated_code);
3221  query_exe_context->launchGpuCode(
3222  ra_exe_unit_copy,
3223  gpu_generated_code.get(),
3224  hoist_literals,
3225  hoist_buf,
3226  col_buffers,
3227  num_rows,
3228  frag_offsets,
3229  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
3230  data_mgr,
3231  blockSize(),
3232  gridSize(),
3233  device_id,
3234  compilation_result.gpu_smem_context.getSharedMemorySize(),
3235  &error_code,
3236  num_tables,
3237  allow_runtime_interrupt,
3238  join_hash_table_ptrs,
3239  render_allocator_map_ptr);
3240  } catch (const OutOfMemory&) {
3241  return ERR_OUT_OF_GPU_MEM;
3242  } catch (const OutOfRenderMemory&) {
3243  return ERR_OUT_OF_RENDER_MEM;
3244  } catch (const StreamingTopNNotSupportedInRenderQuery&) {
3245  return ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY;
3246  } catch (const std::exception& e) {
3247  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
3248  }
3249  }
3250 
3251  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
3252  error_code == Executor::ERR_DIV_BY_ZERO ||
3253  error_code == Executor::ERR_OUT_OF_TIME ||
3254  error_code == Executor::ERR_INTERRUPTED ||
3255  error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES ||
3256  error_code == Executor::ERR_GEOS) {
3257  return error_code;
3258  }
3259 
3260  if (error_code != Executor::ERR_OVERFLOW_OR_UNDERFLOW &&
3261  error_code != Executor::ERR_DIV_BY_ZERO && !render_allocator_map_ptr) {
3262  results = query_exe_context->getRowSet(ra_exe_unit_copy,
3263  query_exe_context->query_mem_desc_);
3264  CHECK(results);
3265  VLOG(2) << "results->rowCount()=" << results->rowCount();
3266  results->holdLiterals(hoist_buf);
3267  }
3268  if (error_code < 0 && render_allocator_map_ptr) {
3269  auto const adjusted_scan_limit =
3270  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
3271  // More rows passed the filter than available slots. We don't have a count to check,
3272  // so assume we met the limit if a scan limit is set
3273  if (adjusted_scan_limit != 0) {
3274  return 0;
3275  } else {
3276  return error_code;
3277  }
3278  }
3279  if (error_code && (!scan_limit || check_rows_less_than_needed(results, scan_limit))) {
3280  return error_code; // unlucky, not enough results and we ran out of slots
3281  }
3282 
3283  return 0;
3284 }
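The UNION ALL branch above (source lines 3175–3193) prunes every input descriptor that is not associated with outer_table_id before the kernel launch: a stable_sort moves the outer table's descriptors to the front, a pop_back loop drops the rest, and remove_if filters the column descriptors. A minimal sketch of that pattern, using a hypothetical Desc stand-in rather than the real InputDescriptor/InputColDescriptor types:

#include <algorithm>
#include <list>
#include <memory>
#include <vector>

// Hypothetical stand-in for InputDescriptor / InputColDescriptor.
struct Desc {
  int table_id;
  int getTableId() const { return table_id; }
};

void prune_to_outer_table(std::vector<Desc>& input_descs,
                          std::list<std::shared_ptr<Desc>>& input_col_descs,
                          const int outer_table_id) {
  // Stable-sort so descriptors belonging to outer_table_id come first...
  std::stable_sort(input_descs.begin(),
                   input_descs.end(),
                   [outer_table_id](const Desc& a, const Desc& b) {
                     return a.getTableId() == outer_table_id &&
                            b.getTableId() != outer_table_id;
                   });
  // ...then pop everything after them off the back.
  while (!input_descs.empty() &&
         input_descs.back().getTableId() != outer_table_id) {
    input_descs.pop_back();
  }
  // Column descriptors can be filtered directly (the real code goes through
  // getScanDesc().getTableId()).
  input_col_descs.remove_if([outer_table_id](const std::shared_ptr<Desc>& d) {
    return d->getTableId() != outer_table_id;
  });
}

The stable sort keeps the surviving outer-table descriptors in their original relative order; only descriptors belonging to other tables are dropped.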
QuerySessionId & getCurrentQuerySession(mapd_shared_lock< mapd_shared_mutex > &read_lock)
Definition: Execute.cpp:3885
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1094
int32_t executePlanWithGroupBy(const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info)
Definition: Execute.cpp:3103
bool useCudaBuffers() const
Definition: RenderInfo.cpp:69
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1142
void setEntryCount(const size_t val)
GpuSharedMemoryContext gpu_smem_context
const std::optional< bool > union_all
#define LOG(tag)
Definition: Logger.h:200
size_t getSharedMemorySize() const
std::vector< InputDescriptor > input_descs
static const int32_t ERR_GEOS
Definition: Execute.h:1148
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:76
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
bool checkIsQuerySessionInterrupted(const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
Definition: Execute.cpp:4193
std::unique_ptr< QueryMemoryInitializer > query_buffers_
std::vector< int64_t > getJoinHashTablePtrs(const ExecutorDeviceType device_type, const int device_id)
Definition: Execute.cpp:3286
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY
Definition: Execute.h:1146
static const int32_t ERR_DIV_BY_ZERO
Definition: Execute.h:1134
ResultSetPtr getRowSet(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc) const
#define INJECT_TIMER(DESC)
Definition: measure.h:93
#define CHECK_NE(x, y)
Definition: Logger.h:215
static const int32_t ERR_OUT_OF_RENDER_MEM
Definition: Execute.h:1138
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW
Definition: Execute.h:1140
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1141
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
Definition: Execute.h:1147
static const int32_t ERR_OUT_OF_GPU_MEM
Definition: Execute.h:1135
std::shared_ptr< CompilationContext > generated_code
QueryMemoryDescriptor query_mem_desc_
std::vector< int8_t > serializeLiterals(const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
Definition: Execute.cpp:390
unsigned gridSize() const
Definition: Execute.cpp:3379
std::unordered_map< int, CgenState::LiteralValues > literal_values
std::unique_ptr< RenderAllocatorMap > render_allocator_map_ptr
Definition: RenderInfo.h:32
bool check_rows_less_than_needed(const ResultSetPtr &results, const size_t scan_limit)
Definition: Execute.cpp:3096
static std::atomic< bool > interrupted_
Definition: Execute.h:1058
#define CHECK(condition)
Definition: Logger.h:206
#define DEBUG_TIMER(name)
Definition: Logger.h:322
std::vector< int64_t * > launchCpuCode(const RelAlgExecutionUnit &ra_exe_unit, const CpuCompilationContext *fn_ptrs, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, int32_t *error_code, const uint32_t num_tables, const std::vector< int64_t > &join_hash_tables)
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:96
std::vector< int64_t * > launchGpuCode(const RelAlgExecutionUnit &ra_exe_unit, const GpuCompilationContext *cu_functions, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, Data_Namespace::DataMgr *data_mgr, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const size_t shared_memory_size, int32_t *error_code, const uint32_t num_tables, const bool allow_runtime_interrupt, const std::vector< int64_t > &join_hash_tables, RenderAllocatorMap *render_allocator_map)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
unsigned blockSize() const
Definition: Execute.cpp:3393
#define VLOG(n)
Definition: Logger.h:300

+ Here is the call graph for this function:

int32_t Executor::executePlanWithoutGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const std::vector< Analyzer::Expr * > &  target_exprs,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t * >> &  col_buffers,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const uint32_t  start_rowid,
const uint32_t  num_tables,
const bool  allow_runtime_interrupt,
RenderInfo render_info 
)
private

Definition at line 2890 of file Execute.cpp.

References CHECK, CHECK_EQ, CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, RelAlgExecutionUnit::estimator, QueryExecutionContext::estimator_result_set_, logger::FATAL, g_bigint_count, g_enable_dynamic_watchdog, CompilationResult::generated_code, get_target_info(), QueryExecutionContext::getAggInitValForIndex(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, i, INJECT_TIMER, is_distinct_target(), RenderInfo::isPotentialInSituRender(), GpuSharedMemoryContext::isSharedMemoryUsed(), kAPPROX_COUNT_DISTINCT, kAPPROX_QUANTILE, kAVG, kCOUNT, kSAMPLE, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, reduceResults(), RenderInfo::render_allocator_map_ptr, takes_float_argument(), and RenderInfo::useCudaBuffers().
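In the listing below, the raw int64_t* buffers returned by launchCpuCode() / launchGpuCode() are handed to an OutVecOwner held by output_memory_scope, so they are released on every exit path. A rough sketch of that RAII idea, assuming a simplified owner that frees plain heap arrays (the real OutVecOwner is defined in Execute.cpp and may differ in detail):

#include <cstdint>
#include <vector>

// Hypothetical simplification of OutVecOwner: takes ownership of the
// per-target output buffers and deletes them when the enclosing scope
// unwinds, including on early returns and exceptions.
class OutVecOwnerSketch {
 public:
  explicit OutVecOwnerSketch(std::vector<int64_t*> out_vec)
      : out_vec_(std::move(out_vec)) {}
  ~OutVecOwnerSketch() {
    for (auto* buf : out_vec_) {
      delete[] buf;
    }
  }
  OutVecOwnerSketch(const OutVecOwnerSketch&) = delete;
  OutVecOwnerSketch& operator=(const OutVecOwnerSketch&) = delete;

 private:
  std::vector<int64_t*> out_vec_;
};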

2906  {
2907  INJECT_TIMER(executePlanWithoutGroupBy);
2908  auto timer = DEBUG_TIMER(__func__);
2909  CHECK(!results);
2910  if (col_buffers.empty()) {
2911  return 0;
2912  }
2913 
2914  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
2915  if (render_info) {
2916  // TODO(adb): make sure that we either never get here in the CPU case, or if we do get
2917  // here, we are in non-insitu mode.
2918  CHECK(render_info->useCudaBuffers() || !render_info->isPotentialInSituRender())
2919  << "CUDA disabled rendering in the executePlanWithoutGroupBy query path is "
2920  "currently unsupported.";
2921  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
2922  }
2923 
2924  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
2925  std::vector<int64_t*> out_vec;
2926  const auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
2927  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
2928  std::unique_ptr<OutVecOwner> output_memory_scope;
2929  if (allow_runtime_interrupt) {
2930  bool isInterrupted = false;
2931  {
2932  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
2933  const auto query_session = getCurrentQuerySession(session_read_lock);
2934  isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock);
2935  }
2936  if (isInterrupted) {
2937  return ERR_INTERRUPTED;
2938  }
2939  }
2940  if (g_enable_dynamic_watchdog && interrupted_.load()) {
2941  return ERR_INTERRUPTED;
2942  }
2943  if (device_type == ExecutorDeviceType::CPU) {
2944  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
2945  compilation_result.generated_code);
2946  CHECK(cpu_generated_code);
2947  out_vec = query_exe_context->launchCpuCode(ra_exe_unit,
2948  cpu_generated_code.get(),
2949  hoist_literals,
2950  hoist_buf,
2951  col_buffers,
2952  num_rows,
2953  frag_offsets,
2954  0,
2955  &error_code,
2956  num_tables,
2957  join_hash_table_ptrs);
2958  output_memory_scope.reset(new OutVecOwner(out_vec));
2959  } else {
2960  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
2961  compilation_result.generated_code);
2962  CHECK(gpu_generated_code);
2963  try {
2964  out_vec = query_exe_context->launchGpuCode(
2965  ra_exe_unit,
2966  gpu_generated_code.get(),
2967  hoist_literals,
2968  hoist_buf,
2969  col_buffers,
2970  num_rows,
2971  frag_offsets,
2972  0,
2973  data_mgr,
2974  blockSize(),
2975  gridSize(),
2976  device_id,
2977  compilation_result.gpu_smem_context.