OmniSciDB
Executor Class Reference

#include <Execute.h>


Classes

struct  ExecutorMutexHolder
 
class  FetchCacheAnchor
 
struct  GroupColLLVMValue
 
struct  JoinHashTableOrError
 

Public Types

using ExecutorId = size_t
 
using CachedCardinality = std::pair< bool, size_t >
 

Public Member Functions

 Executor (const ExecutorId id, Data_Namespace::DataMgr *data_mgr, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
 
const TemporaryTables * getTemporaryTables ()
 
StringDictionaryProxy * getStringDictionaryProxy (const int dict_id, const bool with_generation) const
 
StringDictionaryProxy * getStringDictionaryProxy (const int dictId, const std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const
 
bool isCPUOnly () const
 
bool isArchMaxwell (const ExecutorDeviceType dt) const
 
bool containsLeftDeepOuterJoin () const
 
const ColumnDescriptor * getColumnDescriptor (const Analyzer::ColumnVar *) const
 
const ColumnDescriptor * getPhysicalColumnDescriptor (const Analyzer::ColumnVar *, int) const
 
const Catalog_Namespace::Catalog * getCatalog () const
 
void setCatalog (const Catalog_Namespace::Catalog *catalog)
 
Data_Namespace::DataMgr * getDataMgr () const
 
const std::shared_ptr< RowSetMemoryOwner > getRowSetMemoryOwner () const
 
const TemporaryTables * getTemporaryTables () const
 
Fragmenter_Namespace::TableInfo getTableInfo (const int table_id) const
 
const TableGeneration & getTableGeneration (const int table_id) const
 
ExpressionRange getColRange (const PhysicalInput &) const
 
size_t getNumBytesForFetchedRow (const std::set< int > &table_ids_to_fetch) const
 
bool hasLazyFetchColumns (const std::vector< Analyzer::Expr * > &target_exprs) const
 
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo (const std::vector< Analyzer::Expr * > &target_exprs) const
 
void registerActiveModule (void *module, const int device_id) const
 
void unregisterActiveModule (void *module, const int device_id) const
 
void interrupt (const QuerySessionId &query_session="", const QuerySessionId &interrupt_session="")
 
void resetInterrupt ()
 
void enableRuntimeQueryInterrupt (const double runtime_query_check_freq, const unsigned pending_query_check_freq) const
 
int8_t warpSize () const
 
unsigned gridSize () const
 
unsigned numBlocksPerMP () const
 
unsigned blockSize () const
 
size_t maxGpuSlabSize () const
 
ResultSetPtr executeWorkUnit (size_t &max_groups_buffer_entry_guess, const bool is_agg, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
TableUpdateMetadata executeUpdate (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const TableDescriptor *updated_table_desc, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const UpdateLogForFragment::Callback &cb, const bool is_agg)
 
void addTransientStringLiterals (const RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< RowSetMemoryOwner > &row_set_mem_owner)
 
void setupCaching (const std::unordered_set< PhysicalInput > &phys_inputs, const std::unordered_set< int > &phys_table_ids)
 
void setColRangeCache (const AggregatedColRange &aggregated_col_range)
 
ExecutorId getExecutorId () const
 
QuerySessionId & getCurrentQuerySession (mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
QuerySessionStatus::QueryStatus getQuerySessionStatus (const QuerySessionId &candidate_query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool checkCurrentQuerySession (const std::string &candidate_query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
void invalidateRunningQuerySession (mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool addToQuerySessionList (const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted_time_str, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool removeFromQuerySessionList (const QuerySessionId &query_session, const std::string &submitted_time_str, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
void setQuerySessionAsInterrupted (const QuerySessionId &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool checkIsQuerySessionInterrupted (const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool checkIsQuerySessionEnrolled (const QuerySessionId &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool updateQuerySessionStatusWithLock (const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus updated_query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool updateQuerySessionExecutorAssignment (const QuerySessionId &query_session, const std::string &submitted_time_str, const size_t executor_id, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
std::vector< QuerySessionStatus > getQuerySessionInfo (const QuerySessionId &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
mapd_shared_mutex & getSessionLock ()
 
CurrentQueryStatus attachExecutorToQuerySession (const QuerySessionId &query_session_id, const std::string &query_str, const std::string &query_submitted_time)
 
void checkPendingQueryStatus (const QuerySessionId &query_session)
 
void clearQuerySessionStatus (const QuerySessionId &query_session, const std::string &submitted_time_str)
 
void updateQuerySessionStatus (const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus new_query_status)
 
void enrollQuerySession (const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted_time_str, const size_t executor_id, const QuerySessionStatus::QueryStatus query_session_status)
 
const std::vector< size_t > getExecutorIdsRunningQuery (const QuerySessionId &interrupt_session) const
 
bool checkNonKernelTimeInterrupted () const
 
void addToCardinalityCache (const std::string &cache_key, const size_t cache_value)
 
CachedCardinality getCachedCardinality (const std::string &cache_key)
 
mapd_shared_mutex & getDataRecyclerLock ()
 
QueryPlanDagCache & getQueryPlanDagCache ()
 
JoinColumnsInfo getJoinColumnsInfo (const Analyzer::Expr *join_expr, JoinColumnSide target_side, bool extract_only_col_id)
 

Static Public Member Functions

static std::shared_ptr< Executor > getExecutor (const ExecutorId id, const std::string &debug_dir="", const std::string &debug_file="", const SystemParameters &system_parameters=SystemParameters())
 
static void nukeCacheOfExecutors ()
 
static void clearMemory (const Data_Namespace::MemoryLevel memory_level)
 
static size_t getArenaBlockSize ()
 
static void addUdfIrToModule (const std::string &udf_ir_filename, const bool is_cuda_ir)
 
static std::pair< int64_t, int32_t > reduceResults (const SQLAgg agg, const SQLTypeInfo &ti, const int64_t agg_init_val, const int8_t out_byte_width, const int64_t *out_vec, const size_t out_vec_sz, const bool is_group_by, const bool float_argument_input)
 
static void addCodeToCache (const CodeCacheKey &, std::shared_ptr< CompilationContext >, llvm::Module *, CodeCache &)
 

Static Public Attributes

static const ExecutorId UNITARY_EXECUTOR_ID = 0
 
static const size_t high_scan_limit
 
static const int32_t ERR_DIV_BY_ZERO {1}
 
static const int32_t ERR_OUT_OF_GPU_MEM {2}
 
static const int32_t ERR_OUT_OF_SLOTS {3}
 
static const int32_t ERR_UNSUPPORTED_SELF_JOIN {4}
 
static const int32_t ERR_OUT_OF_RENDER_MEM {5}
 
static const int32_t ERR_OUT_OF_CPU_MEM {6}
 
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW {7}
 
static const int32_t ERR_OUT_OF_TIME {9}
 
static const int32_t ERR_INTERRUPTED {10}
 
static const int32_t ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED {11}
 
static const int32_t ERR_TOO_MANY_LITERALS {12}
 
static const int32_t ERR_STRING_CONST_IN_RESULTSET {13}
 
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY {14}
 
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES {15}
 
static const int32_t ERR_GEOS {16}
 
static const int32_t ERR_WIDTH_BUCKET_INVALID_ARGUMENT {17}
 
static std::mutex compilation_mutex_
 
static std::mutex kernel_mutex_
 

Private Types

using PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>
 

Private Member Functions

void clearMetaInfoCache ()
 
int deviceCount (const ExecutorDeviceType) const
 
int deviceCountForMemoryLevel (const Data_Namespace::MemoryLevel memory_level) const
 
llvm::Value * codegenWindowFunction (const size_t target_index, const CompilationOptions &co)
 
llvm::Value * codegenWindowFunctionAggregate (const CompilationOptions &co)
 
llvm::BasicBlock * codegenWindowResetStateControlFlow ()
 
void codegenWindowFunctionStateInit (llvm::Value *aggregate_state)
 
llvm::Value * codegenWindowFunctionAggregateCalls (llvm::Value *aggregate_state, const CompilationOptions &co)
 
void codegenWindowAvgEpilogue (llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
 
llvm::Value * codegenAggregateWindowState ()
 
llvm::Value * aggregateWindowStatePtr ()
 
CudaMgr_Namespace::CudaMgr * cudaMgr () const
 
bool isArchPascalOrLater (const ExecutorDeviceType dt) const
 
bool needFetchAllFragments (const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
 
bool needLinearizeAllFragments (const ColumnDescriptor *cd, const InputColDescriptor &inner_col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments, const Data_Namespace::MemoryLevel memory_level) const
 
void executeWorkUnitPerFragment (const RelAlgExecutionUnit &ra_exe_unit, const InputTableInfo &table_info, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, PerFragmentCallBack &cb, const std::set< size_t > &fragment_indexes_param)
 Compiles and dispatches a work unit per fragment, processing results with the per-fragment callback. Currently used for computing metrics over fragments (metadata). More...
 
ResultSetPtr executeExplain (const QueryCompilationDescriptor &)
 
ResultSetPtr executeTableFunction (const TableFunctionExecutionUnit exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat)
 Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps. More...
 
ExecutorDeviceType getDeviceTypeForTargets (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType requested_device_type)
 
ResultSetPtr collectAllDeviceResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
 
ResultSetPtr collectAllDeviceShardedTopResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
 
std::unordered_map< int, const Analyzer::BinOper * > getInnerTabIdToJoinCond () const
 
std::vector< std::unique_ptr< ExecutionKernel > > createKernels (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, ColumnFetcher &column_fetcher, const std::vector< InputTableInfo > &table_infos, const ExecutionOptions &eo, const bool is_agg, const bool allow_single_frag_table_opt, const size_t context_count, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, RenderInfo *render_info, std::unordered_set< int > &available_gpus, int &available_cpus)
 
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type)
 
std::vector< size_t > getTableFragmentIndices (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type, const size_t table_idx, const size_t outer_frag_idx, std::map< int, const TableFragments * > &selected_tables_fragments, const std::unordered_map< int, const Analyzer::BinOper * > &inner_table_id_to_join_condition)
 
bool skipFragmentPair (const Fragmenter_Namespace::FragmentInfo &outer_fragment_info, const Fragmenter_Namespace::FragmentInfo &inner_fragment_info, const int inner_table_id, const std::unordered_map< int, const Analyzer::BinOper * > &inner_table_id_to_join_condition, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
 
FetchResult fetchChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments * > &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
 
FetchResult fetchUnionChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments * > &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
 
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags (const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< int, const TableFragments * > &all_tables_fragments)
 
void buildSelectedFragsMapping (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
void buildSelectedFragsMappingForUnion (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< size_t > getFragmentCount (const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
 
int32_t executePlanWithGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr *results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info, const int64_t rows_to_process=-1)
 
int32_t executePlanWithoutGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr *results, const std::vector< Analyzer::Expr * > &target_exprs, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, QueryExecutionContext *query_exe_context, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *data_mgr, const int device_id, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info, const int64_t rows_to_process=-1)
 
ResultSetPtr resultsUnion (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< int8_t * > getJoinHashTablePtrs (const ExecutorDeviceType device_type, const int device_id)
 
ResultSetPtr reduceMultiDeviceResults (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceMultiDeviceResultSets (std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceSpeculativeTopN (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr executeWorkUnitImpl (size_t &max_groups_buffer_entry_guess, const bool is_agg, const bool allow_single_frag_table_opt, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, std::shared_ptr< RowSetMemoryOwner >, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
std::vector< llvm::Value * > inlineHoistedLiterals ()
 
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit (const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
 
llvm::BasicBlock * codegenSkipDeletedOuterTableRow (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::vector< JoinLoop > buildJoinLoops (RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
 
JoinLoop::HoistedFiltersCallback buildHoistLeftHandSideFiltersCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const int inner_table_id, const CompilationOptions &co)
 
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
 
std::shared_ptr< HashJoin > buildCurrentLevelHashTable (const JoinCondition &current_level_join_conditions, size_t level_idx, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
 
void redeclareFilterFunction ()
 
llvm::Value * addJoinLoopIterator (const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
 
void codegenJoinLoops (const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
 
bool compileBody (const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
 
void createErrorCheckControlFlow (llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
 
void insertErrorCodeChecker (llvm::Function *query_func, bool hoist_literals, bool allow_runtime_query_interrupt)
 
void preloadFragOffsets (const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
 
JoinHashTableOrError buildHashTableForQualifier (const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, ColumnCacheMap &column_cache, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
 
void nukeOldState (const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU (llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU (llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
 
std::string generatePTX (const std::string &) const
 
void initializeNVPTXBackend () const
 
int64_t deviceCycles (int milliseconds) const
 
GroupColLLVMValue groupByColumnCodegen (Analyzer::Expr *group_by_col, const size_t col_width, const CompilationOptions &, const bool translate_null_val, const int64_t translated_null_val, DiamondCodegen &, std::stack< llvm::BasicBlock * > &, const bool thread_mem_shared)
 
llvm::Value * castToFP (llvm::Value *, SQLTypeInfo const &from_ti, SQLTypeInfo const &to_ti)
 
llvm::Value * castToIntPtrTyIn (llvm::Value *val, const size_t bit_width)
 
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
bool isFragmentFullyDeleted (const int table_id, const Fragmenter_Namespace::FragmentInfo &fragment)
 
std::pair< bool, int64_t > skipFragment (const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
std::pair< bool, int64_t > skipFragmentInnerJoins (const InputDescriptor &table_desc, const RelAlgExecutionUnit &ra_exe_unit, const Fragmenter_Namespace::FragmentInfo &fragment, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
AggregatedColRange computeColRangesCache (const std::unordered_set< PhysicalInput > &phys_inputs)
 
StringDictionaryGenerations computeStringDictionaryGenerations (const std::unordered_set< PhysicalInput > &phys_inputs)
 
TableGenerations computeTableGenerations (std::unordered_set< int > phys_table_ids)
 
std::shared_ptr< CompilationContext > getCodeFromCache (const CodeCacheKey &, const CodeCache &)
 
std::vector< int8_t > serializeLiterals (const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
 
llvm::Value * spillDoubleElement (llvm::Value *elem_val, llvm::Type *elem_ty)
 
ExecutorMutexHolder acquireExecuteMutex ()
 

Static Private Member Functions

static size_t align (const size_t off_in, const size_t alignment)
 

Private Attributes

std::unique_ptr< CgenState > cgen_state_
 
std::unique_ptr< PlanState > plan_state_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::mutex gpu_exec_mutex_ [max_gpu_count]
 
std::atomic< bool > interrupted_
 
std::mutex str_dict_mutex_
 
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
 
CodeCache cpu_code_cache_
 
CodeCache gpu_code_cache_
 
const unsigned block_size_x_
 
const unsigned grid_size_x_
 
const size_t max_gpu_slab_size_
 
const std::string debug_dir_
 
const std::string debug_file_
 
const ExecutorId executor_id_
 
const Catalog_Namespace::Catalog * catalog_
 
Data_Namespace::DataMgr * data_mgr_
 
const TemporaryTables * temporary_tables_
 
TableIdToNodeMap table_id_to_node_map_
 
int64_t kernel_queue_time_ms_ = 0
 
int64_t compilation_queue_time_ms_ = 0
 
std::unique_ptr< WindowProjectNodeContext > window_project_node_context_owned_
 
WindowFunctionContext * active_window_function_ {nullptr}
 
InputTableInfoCache input_table_info_cache_
 
AggregatedColRange agg_col_range_cache_
 
TableGenerations table_generations_
 
QuerySessionId current_query_session_
 
const QueryPlanHash INVALID_QUERY_PLAN_HASH {std::hash<std::string>{}(EMPTY_QUERY_PLAN)}
 

Static Private Attributes

static const int max_gpu_count {16}
 
static std::mutex gpu_active_modules_mutex_
 
static uint32_t gpu_active_modules_device_mask_ {0x0}
 
static void * gpu_active_modules_ [max_gpu_count]
 
static const size_t baseline_threshold
 
static const size_t code_cache_size {1000}
 
static mapd_shared_mutex executor_session_mutex_
 
static InterruptFlagMap queries_interrupt_flag_
 
static QuerySessionMap queries_session_map_
 
static std::map< int, std::shared_ptr< Executor > > executors_
 
static mapd_shared_mutex execute_mutex_
 
static mapd_shared_mutex executors_cache_mutex_
 
static QueryPlanDagCache query_plan_dag_cache_
 
static mapd_shared_mutex recycler_mutex_
 
static std::unordered_map< std::string, size_t > cardinality_cache_
 

Friends

class BaselineJoinHashTable
 
class CodeGenerator
 
class ColumnFetcher
 
struct DiamondCodegen
 
class ExecutionKernel
 
class KernelSubtask
 
class HashJoin
 
class OverlapsJoinHashTable
 
class RangeJoinHashTable
 
class GroupByAndAggregate
 
class QueryCompilationDescriptor
 
class QueryMemoryDescriptor
 
class QueryMemoryInitializer
 
class QueryFragmentDescriptor
 
class QueryExecutionContext
 
class ResultSet
 
class InValuesBitmap
 
class LeafAggregator
 
class PerfectJoinHashTable
 
class QueryRewriter
 
class PendingExecutionClosure
 
class RelAlgExecutor
 
class TableOptimizer
 
class TableFunctionCompilationContext
 
class TableFunctionExecutionContext
 
struct TargetExprCodegenBuilder
 
struct TargetExprCodegen
 
class WindowProjectNodeContext
 

Detailed Description

Definition at line 376 of file Execute.h.
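
Executors are created and cached through the static getExecutor() factory rather than by calling the constructor directly. A minimal usage sketch, assuming default debug and system parameters (the wrapper function is hypothetical):

#include <utility>
#include "QueryEngine/Execute.h"

// Hedged sketch: obtain (or lazily create) the shared unitary executor and
// read its GPU launch configuration. blockSize() returns 0 when no CUDA
// manager is present, i.e. in CPU-only builds.
std::pair<unsigned, unsigned> launch_config() {
  auto executor = Executor::getExecutor(Executor::UNITARY_EXECUTOR_ID);
  return {executor->blockSize(), executor->gridSize()};
}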

Member Typedef Documentation

using Executor::CachedCardinality = std::pair<bool, size_t>

Definition at line 1031 of file Execute.h.

using Executor::ExecutorId = size_t

Definition at line 383 of file Execute.h.

using Executor::PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>

Definition at line 573 of file Execute.h.
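
Within Executor (the typedef is private), executeWorkUnitPerFragment() hands each fragment's ResultSet to a callback of this type, e.g. to recompute fragment metadata. A sketch of a conforming callback; the body is illustrative only:

Executor::PerFragmentCallBack per_fragment_cb =
    [](ResultSetPtr results, const Fragmenter_Namespace::FragmentInfo& frag_info) {
      // e.g. walk `results` to rebuild chunk metadata for this fragment
      CHECK(results);
    };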

Constructor & Destructor Documentation

Executor::Executor ( const ExecutorId  id,
Data_Namespace::DataMgr *  data_mgr,
const size_t  block_size_x,
const size_t  grid_size_x,
const size_t  max_gpu_slab_size,
const std::string &  debug_dir,
const std::string &  debug_file 
)

Definition at line 155 of file Execute.cpp.

162  : cgen_state_(new CgenState({}, false))
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057

Member Function Documentation

ExecutorMutexHolder Executor::acquireExecuteMutex ( )
inline, private

Definition at line 1136 of file Execute.h.

References execute_mutex_, executor_id_, Executor::ExecutorMutexHolder::shared_lock, Executor::ExecutorMutexHolder::unique_lock, and UNITARY_EXECUTOR_ID.

1136  {
1137  ExecutorMutexHolder ret;
1138  if (executor_id_ == UNITARY_EXECUTOR_ID) {
1139  // Only one unitary executor can run at a time
1140  ret.unique_lock = mapd_unique_lock<mapd_shared_mutex>(execute_mutex_);
1141  } else {
1142  ret.shared_lock = mapd_shared_lock<mapd_shared_mutex>(execute_mutex_);
1143  }
1144  return ret;
1145  }
static mapd_shared_mutex execute_mutex_
Definition: Execute.h:1130
const ExecutorId executor_id_
Definition: Execute.h:1101
static const ExecutorId UNITARY_EXECUTOR_ID
Definition: Execute.h:384
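
The returned holder keeps whichever lock was taken alive for the caller's scope, so call sites need no branching of their own. A sketch of the intended idiom inside Executor (runStep() is a hypothetical stand-in for the work done under the lock):

{
  // Unitary executor: exclusive access; any other executor: shared access.
  auto lock_holder = acquireExecuteMutex();
  runStep();  // hypothetical work performed under the execute mutex
}  // whichever member lock was engaged is released here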
void Executor::addCodeToCache ( const CodeCacheKey &  key,
std::shared_ptr< CompilationContext >  compilation_context,
llvm::Module *  module,
CodeCache &  cache 
)
static

Definition at line 412 of file NativeCodegen.cpp.

References LruCache< key_t, value_t, hash_t >::put().

Referenced by StubGenerator::generateStub().

415  {
416  cache.put(key,
417  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
418  std::move(compilation_context), std::move(module)));
419 }
void put(const key_t &key, value_t &&value)
Definition: LruCache.hpp:27


std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > Executor::addDeletedColumn ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 3546 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::add_deleted_col_to_map(), catalog_(), CHECK, CompilationOptions::filter_on_deleted_column, and TABLE.

3548  {
3549  if (!co.filter_on_deleted_column) {
3550  return std::make_tuple(ra_exe_unit, PlanState::DeletedColumnsMap{});
3551  }
3552  auto ra_exe_unit_with_deleted = ra_exe_unit;
3553  PlanState::DeletedColumnsMap deleted_cols_map;
3554  for (const auto& input_table : ra_exe_unit_with_deleted.input_descs) {
3555  if (input_table.getSourceType() != InputSourceType::TABLE) {
3556  continue;
3557  }
3558  const auto td = catalog_->getMetadataForTable(input_table.getTableId());
3559  CHECK(td);
3560  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
3561  if (!deleted_cd) {
3562  continue;
3563  }
3564  CHECK(deleted_cd->columnType.is_boolean());
3565  // check deleted column is not already present
3566  bool found = false;
3567  for (const auto& input_col : ra_exe_unit_with_deleted.input_col_descs) {
3568  if (input_col.get()->getColId() == deleted_cd->columnId &&
3569  input_col.get()->getScanDesc().getTableId() == deleted_cd->tableId &&
3570  input_col.get()->getScanDesc().getNestLevel() == input_table.getNestLevel()) {
3571  found = true;
3572  add_deleted_col_to_map(deleted_cols_map, deleted_cd);
3573  break;
3574  }
3575  }
3576  if (!found) {
3577  // add deleted column
3578  ra_exe_unit_with_deleted.input_col_descs.emplace_back(new InputColDescriptor(
3579  deleted_cd->columnId, deleted_cd->tableId, input_table.getNestLevel()));
3580  add_deleted_col_to_map(deleted_cols_map, deleted_cd);
3581  }
3582  }
3583  return std::make_tuple(ra_exe_unit_with_deleted, deleted_cols_map);
3584 }
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1102
const ColumnDescriptor * getDeletedColumnIfRowsDeleted(const TableDescriptor *td) const
Definition: Catalog.cpp:3183
std::unordered_map< TableId, const ColumnDescriptor * > DeletedColumnsMap
Definition: PlanState.h:44
#define CHECK(condition)
Definition: Logger.h:209
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
void add_deleted_col_to_map(PlanState::DeletedColumnsMap &deleted_cols_map, const ColumnDescriptor *deleted_cd)
Definition: Execute.cpp:3534


llvm::Value * Executor::addJoinLoopIterator ( const std::vector< llvm::Value * > &  prev_iters,
const size_t  level_idx 
)
private

Definition at line 1108 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, and CHECK.

1109  {
1110  AUTOMATIC_IR_METADATA(cgen_state_.get());
1111  // Iterators are added for loop-outer joins when the head of the loop is generated,
1112  // then once again when the body is generated. Allow this instead of special handling
1113  // of call sites.
1114  const auto it = cgen_state_->scan_idx_to_hash_pos_.find(level_idx);
1115  if (it != cgen_state_->scan_idx_to_hash_pos_.end()) {
1116  return it->second;
1117  }
1118  CHECK(!prev_iters.empty());
1119  llvm::Value* matching_row_index = prev_iters.back();
1120  const auto it_ok =
1121  cgen_state_->scan_idx_to_hash_pos_.emplace(level_idx, matching_row_index);
1122  CHECK(it_ok.second);
1123  return matching_row_index;
1124 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:209
void Executor::addToCardinalityCache ( const std::string &  cache_key,
const size_t  cache_value 
)

Definition at line 4242 of file Execute.cpp.

References g_use_estimator_result_cache, and VLOG.

4243  {
4244  if (g_use_estimator_result_cache) {
4245  mapd_unique_lock<mapd_shared_mutex> lock(recycler_mutex_);
4246  cardinality_cache_[cache_key] = cache_value;
4247  VLOG(1) << "Put estimated cardinality to the cache";
4248  }
4249 }
static std::unordered_map< std::string, size_t > cardinality_cache_
Definition: Execute.h:1152
bool g_use_estimator_result_cache
Definition: Execute.cpp:120
static mapd_shared_mutex recycler_mutex_
Definition: Execute.h:1151
#define VLOG(n)
Definition: Logger.h:303
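
Together with getCachedCardinality(), this forms a check-then-publish protocol keyed on a serialized plan string. A hedged consumer sketch, assuming a miss yields a CachedCardinality whose first element is false (estimate_cardinality() and cache_key are hypothetical):

auto cached = executor->getCachedCardinality(cache_key);
size_t cardinality = 0;
if (cached.first) {
  cardinality = cached.second;  // hit: reuse the prior estimate
} else {
  cardinality = estimate_cardinality();  // hypothetical recomputation
  executor->addToCardinalityCache(cache_key, cardinality);
}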
bool Executor::addToQuerySessionList ( const QuerySessionId &  query_session,
const std::string &  query_str,
const std::string &  submitted_time_str,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_status,
mapd_unique_lock< mapd_shared_mutex > &  write_lock 
)

Definition at line 4074 of file Execute.cpp.

4079  {
4080  // an internal API that enrolls the query session into the Executor's session map
4081  if (queries_session_map_.count(query_session)) {
4082  if (queries_session_map_.at(query_session).count(submitted_time_str)) {
4083  queries_session_map_.at(query_session).erase(submitted_time_str);
4084  queries_session_map_.at(query_session)
4085  .emplace(submitted_time_str,
4086  QuerySessionStatus(query_session,
4087  executor_id,
4088  query_str,
4089  submitted_time_str,
4090  query_status));
4091  } else {
4092  queries_session_map_.at(query_session)
4093  .emplace(submitted_time_str,
4094  QuerySessionStatus(query_session,
4095  executor_id,
4096  query_str,
4097  submitted_time_str,
4098  query_status));
4099  }
4100  } else {
4101  std::map<std::string, QuerySessionStatus> executor_per_query_map;
4102  executor_per_query_map.emplace(
4103  submitted_time_str,
4104  QuerySessionStatus(
4105  query_session, executor_id, query_str, submitted_time_str, query_status));
4106  queries_session_map_.emplace(query_session, executor_per_query_map);
4107  }
4108  return queries_interrupt_flag_.emplace(query_session, false).second;
4109 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1125
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1123
void Executor::addTransientStringLiterals ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::shared_ptr< RowSetMemoryOwner > &  row_set_mem_owner 
)

Definition at line 1739 of file Execute.cpp.

References CHECK, RelAlgExecutionUnit::groupby_exprs, kENCODING_DICT, kSAMPLE, kSINGLE_VALUE, RelAlgExecutionUnit::quals, RelAlgExecutionUnit::simple_quals, RelAlgExecutionUnit::target_exprs, and ScalarExprVisitor< T >::visit().

1741  {
1742  TransientDictIdVisitor dict_id_visitor;
1743 
1744  auto visit_expr =
1745  [this, &dict_id_visitor, &row_set_mem_owner](const Analyzer::Expr* expr) {
1746  if (!expr) {
1747  return;
1748  }
1749  const auto dict_id = dict_id_visitor.visit(expr);
1750  if (dict_id >= 0) {
1751  auto sdp = getStringDictionaryProxy(dict_id, row_set_mem_owner, true);
1752  CHECK(sdp);
1753  TransientStringLiteralsVisitor visitor(sdp);
1754  visitor.visit(expr);
1755  }
1756  };
1757 
1758  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
1759  visit_expr(group_expr.get());
1760  }
1761 
1762  for (const auto& group_expr : ra_exe_unit.quals) {
1763  visit_expr(group_expr.get());
1764  }
1765 
1766  for (const auto& group_expr : ra_exe_unit.simple_quals) {
1767  visit_expr(group_expr.get());
1768  }
1769 
1770  for (const auto target_expr : ra_exe_unit.target_exprs) {
1771  const auto& target_type = target_expr->get_type_info();
1772  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
1773  continue;
1774  }
1775  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1776  if (agg_expr) {
1777  if (agg_expr->get_aggtype() == kSINGLE_VALUE ||
1778  agg_expr->get_aggtype() == kSAMPLE) {
1779  visit_expr(agg_expr->get_arg());
1780  }
1781  } else {
1782  visit_expr(target_expr);
1783  }
1784  }
1785 }
std::vector< Analyzer::Expr * > target_exprs
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
T visit(const Analyzer::Expr *expr) const
StringDictionaryProxy * getStringDictionaryProxy(const int dict_id, const bool with_generation) const
Definition: Execute.h:421
std::list< std::shared_ptr< Analyzer::Expr > > quals
#define CHECK(condition)
Definition: Logger.h:209
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals


void Executor::addUdfIrToModule ( const std::string &  udf_ir_filename,
const bool  is_cuda_ir 
)
static

Definition at line 1807 of file NativeCodegen.cpp.

References shared::getExpOfTwo().

Referenced by DBHandler::initialize().

1808  {
1809  if (is_cuda_ir) {
1810  read_udf_gpu_module(udf_ir_filename);
1811  } else {
1812  read_udf_cpu_module(udf_ir_filename);
1813  }
1814 }
void read_udf_cpu_module(const std::string &udf_ir_filename)
void read_udf_gpu_module(const std::string &udf_ir_filename)
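
A registration sketch for server startup, mirroring the in-tree caller DBHandler::initialize(); the file paths are hypothetical:

// Register LLVM IR for user-defined functions before any query is compiled.
Executor::addUdfIrToModule("/opt/udfs/udf_cpu.ll", /*is_cuda_ir=*/false);
Executor::addUdfIrToModule("/opt/udfs/udf_gpu.ll", /*is_cuda_ir=*/true);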


llvm::Value * Executor::aggregateWindowStatePtr ( )
private

Definition at line 124 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), and kFLOAT.

124  {
125  AUTOMATIC_IR_METADATA(cgen_state_.get());
126  const auto window_func_context =
127  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
128  const auto window_func = window_func_context->getWindowFunction();
129  const auto arg_ti = get_adjusted_window_type_info(window_func);
130  llvm::Type* aggregate_state_type =
131  arg_ti.get_type() == kFLOAT
132  ? llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0)
133  : llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
134  const auto aggregate_state_i64 = cgen_state_->llInt(
135  reinterpret_cast<const int64_t>(window_func_context->aggregateState()));
136  return cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_i64,
137  aggregate_state_type);
138 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)


static size_t Executor::align ( const size_t  off_in,
const size_t  alignment 
)
inline, static, private

Definition at line 1049 of file Execute.h.

1049  {
1050  size_t off = off_in;
1051  if (off % alignment != 0) {
1052  off += (alignment - off % alignment);
1053  }
1054  return off;
1055  }
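
align() rounds an offset up to the next multiple of the given alignment, leaving already-aligned offsets unchanged. Worked examples (written as hypothetical in-class checks, since the member is private):

assert(align(13, 8) == 16);  // 13 % 8 == 5, so 3 bytes of padding are added
assert(align(16, 8) == 16);  // already aligned: returned unchanged
assert(align(1, 4) == 4);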
CurrentQueryStatus Executor::attachExecutorToQuerySession ( const QuerySessionId &  query_session_id,
const std::string &  query_str,
const std::string &  query_submitted_time 
)

Definition at line 3977 of file Execute.cpp.

References executor_id_().

3980  {
3981  if (!query_session_id.empty()) {
3982  // if session is valid, do update 1) the exact executor id and 2) query status
3983  mapd_unique_lock<mapd_shared_mutex> write_lock(executor_session_mutex_);
3984  updateQuerySessionExecutorAssignment(
3985  query_session_id, query_submitted_time, executor_id_, write_lock);
3986  updateQuerySessionStatusWithLock(query_session_id,
3987  query_submitted_time,
3988  QuerySessionStatus::QueryStatus::PENDING_EXECUTOR,
3989  write_lock);
3990  }
3991  return {query_session_id, query_str};
3992 }
bool updateQuerySessionStatusWithLock(const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus updated_query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:4111
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1119
const ExecutorId executor_id_
Definition: Execute.h:1101
bool updateQuerySessionExecutorAssignment(const QuerySessionId &query_session, const std::string &submitted_time_str, const size_t executor_id, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:4137
mapd_unique_lock< mapd_shared_mutex > write_lock


unsigned Executor::blockSize ( ) const

Definition at line 3454 of file Execute.cpp.

References block_size_x_(), CHECK, and data_mgr_().

3454  {
3455  CHECK(data_mgr_);
3456  const auto cuda_mgr = data_mgr_->getCudaMgr();
3457  if (!cuda_mgr) {
3458  return 0;
3459  }
3460  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3461  return block_size_x_ ? block_size_x_ : dev_props.front().maxThreadsPerBlock;
3462 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:218
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1103
const unsigned block_size_x_
Definition: Execute.h:1095
#define CHECK(condition)
Definition: Logger.h:209
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:120
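
A sketch of deriving a total-thread budget from the configured launch parameters; the zero return doubles as a CPU-only signal (illustrative only, and the use of the product is an assumption):

const unsigned threads_per_block = executor->blockSize();  // 0 if CPU-only
const unsigned blocks_per_grid = executor->gridSize();
if (threads_per_block > 0 && blocks_per_grid > 0) {
  const size_t total_gpu_threads =
      static_cast<size_t>(threads_per_block) * blocks_per_grid;
  // e.g. partition per-thread work based on total_gpu_threads
}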


std::shared_ptr< HashJoin > Executor::buildCurrentLevelHashTable ( const JoinCondition &  current_level_join_conditions,
size_t  level_idx,
RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache,
std::vector< std::string > &  fail_reasons 
)
private

Definition at line 949 of file IRCodegen.cpp.

References anonymous_namespace{IRCodegen.cpp}::add_qualifier_to_execution_unit(), AUTOMATIC_IR_METADATA, anonymous_namespace{IRCodegen.cpp}::check_valid_join_qual(), Data_Namespace::CPU_LEVEL, CompilationOptions::device_type, Executor::JoinHashTableOrError::fail_reason, GPU, Data_Namespace::GPU_LEVEL, Executor::JoinHashTableOrError::hash_table, RelAlgExecutionUnit::hash_table_build_plan_dag, IS_EQUIVALENCE, LEFT, OneToOne, JoinCondition::quals, RelAlgExecutionUnit::query_hint, RelAlgExecutionUnit::table_id_to_node_map, JoinCondition::type, and VLOG.

956  {
957  AUTOMATIC_IR_METADATA(cgen_state_.get());
958  std::shared_ptr<HashJoin> current_level_hash_table;
959  auto handleNonHashtableQual = [&ra_exe_unit, &level_idx, this](
960  JoinType join_type,
961  std::shared_ptr<Analyzer::Expr> qual) {
962  if (join_type == JoinType::LEFT) {
963  plan_state_->addNonHashtableQualForLeftJoin(level_idx, qual);
964  } else {
965  add_qualifier_to_execution_unit(ra_exe_unit, qual);
966  }
967  };
968  for (const auto& join_qual : current_level_join_conditions.quals) {
969  auto qual_bin_oper = std::dynamic_pointer_cast<Analyzer::BinOper>(join_qual);
970  if (current_level_hash_table || !qual_bin_oper ||
971  !IS_EQUIVALENCE(qual_bin_oper->get_optype())) {
972  handleNonHashtableQual(current_level_join_conditions.type, join_qual);
973  if (!current_level_hash_table) {
974  fail_reasons.emplace_back("No equijoin expression found");
975  }
976  continue;
977  }
978  check_valid_join_qual(qual_bin_oper);
979  JoinHashTableOrError hash_table_or_error;
980  if (!current_level_hash_table) {
981  hash_table_or_error = buildHashTableForQualifier(
982  qual_bin_oper,
983  query_infos,
984  co.device_type == ExecutorDeviceType::GPU ? Data_Namespace::GPU_LEVEL
985  : Data_Namespace::CPU_LEVEL,
986  current_level_join_conditions.type,
987  HashType::OneToOne,
988  column_cache,
989  ra_exe_unit.hash_table_build_plan_dag,
990  ra_exe_unit.query_hint,
991  ra_exe_unit.table_id_to_node_map);
992  current_level_hash_table = hash_table_or_error.hash_table;
993  }
994  if (hash_table_or_error.hash_table) {
995  plan_state_->join_info_.join_hash_tables_.push_back(hash_table_or_error.hash_table);
996  plan_state_->join_info_.equi_join_tautologies_.push_back(qual_bin_oper);
997  } else {
998  fail_reasons.push_back(hash_table_or_error.fail_reason);
999  if (!current_level_hash_table) {
1000  VLOG(2) << "Building a hashtable based on a qual " << qual_bin_oper->toString()
1001  << " fails: " << hash_table_or_error.fail_reason;
1002  }
1003  handleNonHashtableQual(current_level_join_conditions.type, qual_bin_oper);
1004  }
1005  }
1006  return current_level_hash_table;
1007 }
JoinType
Definition: sqldefs.h:108
#define IS_EQUIVALENCE(X)
Definition: sqldefs.h:67
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
TableIdToNodeMap table_id_to_node_map
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
void add_qualifier_to_execution_unit(RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< Analyzer::Expr > &qual)
Definition: IRCodegen.cpp:496
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
JoinHashTableOrError buildHashTableForQualifier(const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, ColumnCacheMap &column_cache, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
Definition: Execute.cpp:3398
std::list< std::shared_ptr< Analyzer::Expr > > quals
RegisteredQueryHint query_hint
#define VLOG(n)
Definition: Logger.h:303
HashTableBuildDagMap hash_table_build_plan_dag
void check_valid_join_qual(std::shared_ptr< Analyzer::BinOper > &bin_oper)
Definition: IRCodegen.cpp:526


Executor::JoinHashTableOrError Executor::buildHashTableForQualifier ( const std::shared_ptr< Analyzer::BinOper > &  qual_bin_oper,
const std::vector< InputTableInfo > &  query_infos,
const MemoryLevel  memory_level,
const JoinType  join_type,
const HashType  preferred_hash_type,
ColumnCacheMap &  column_cache,
const HashTableBuildDagMap &  hashtable_build_dag_map,
const RegisteredQueryHint &  query_hint,
const TableIdToNodeMap &  table_id_to_node_map 
)
private

Definition at line 3398 of file Execute.cpp.

References g_enable_dynamic_watchdog, g_enable_overlaps_hashjoin, and HashJoin::getInstance().

3407  {
3408  if (!g_enable_overlaps_hashjoin && qual_bin_oper->is_overlaps_oper()) {
3409  return {nullptr, "Overlaps hash join disabled, attempting to fall back to loop join"};
3410  }
3411  if (g_enable_dynamic_watchdog && interrupted_.load()) {
3412  throw QueryExecutionError(ERR_INTERRUPTED);
3413  }
3414  try {
3415  auto tbl = HashJoin::getInstance(qual_bin_oper,
3416  query_infos,
3417  memory_level,
3418  join_type,
3419  preferred_hash_type,
3420  deviceCountForMemoryLevel(memory_level),
3421  column_cache,
3422  this,
3423  hashtable_build_dag_map,
3424  query_hint,
3425  table_id_to_node_map);
3426  return {tbl, ""};
3427  } catch (const HashJoinFail& e) {
3428  return {nullptr, e.what()};
3429  }
3430 }
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1163
std::atomic< bool > interrupted_
Definition: Execute.h:1082
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:77
bool g_enable_overlaps_hashjoin
Definition: Execute.cpp:98
int deviceCountForMemoryLevel(const Data_Namespace::MemoryLevel memory_level) const
Definition: Execute.cpp:681
static std::shared_ptr< HashJoin > getInstance(const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
Make hash table from an in-flight SQL query&#39;s parse tree etc.
Definition: HashJoin.cpp:238


JoinLoop::HoistedFiltersCallback Executor::buildHoistLeftHandSideFiltersCb ( const RelAlgExecutionUnit &  ra_exe_unit,
const size_t  level_idx,
const int  inner_table_id,
const CompilationOptions &  co 
)
private

Definition at line 782 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CodeGenerator::codegen(), g_enable_left_join_filter_hoisting, RelAlgExecutionUnit::join_quals, LEFT, RelAlgExecutionUnit::quals, RelAlgExecutionUnit::simple_quals, CodeGenerator::toBool(), and VLOG.

786  {
787  if (!g_enable_left_join_filter_hoisting) {
788  return nullptr;
789  }
790 
791  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
792  if (level_idx == 0 && current_level_join_conditions.type == JoinType::LEFT) {
793  const auto& condition = current_level_join_conditions.quals.front();
794  const auto bin_oper = dynamic_cast<const Analyzer::BinOper*>(condition.get());
795  CHECK(bin_oper) << condition->toString();
796  const auto rhs =
797  dynamic_cast<const Analyzer::ColumnVar*>(bin_oper->get_right_operand());
798  const auto lhs =
799  dynamic_cast<const Analyzer::ColumnVar*>(bin_oper->get_left_operand());
800  if (lhs && rhs && lhs->get_table_id() != rhs->get_table_id()) {
801  const Analyzer::ColumnVar* selected_lhs{nullptr};
802  // grab the left hand side column -- this is somewhat similar to normalize column
803  // pair, and a better solution may be to hoist that function out of the join
804  // framework and normalize columns at the top of build join loops
805  if (lhs->get_table_id() == inner_table_id) {
806  selected_lhs = rhs;
807  } else if (rhs->get_table_id() == inner_table_id) {
808  selected_lhs = lhs;
809  }
810  if (selected_lhs) {
811  std::list<std::shared_ptr<Analyzer::Expr>> hoisted_quals;
812  // get all LHS-only filters
813  auto should_hoist_qual = [&hoisted_quals](const auto& qual, const int table_id) {
814  CHECK(qual);
815 
816  ExprTableIdVisitor visitor;
817  const auto table_ids = visitor.visit(qual.get());
818  if (table_ids.size() == 1 && table_ids.find(table_id) != table_ids.end()) {
819  hoisted_quals.push_back(qual);
820  }
821  };
822  for (const auto& qual : ra_exe_unit.simple_quals) {
823  should_hoist_qual(qual, selected_lhs->get_table_id());
824  }
825  for (const auto& qual : ra_exe_unit.quals) {
826  should_hoist_qual(qual, selected_lhs->get_table_id());
827  }
828 
829  // build the filters callback and return it
830  if (!hoisted_quals.empty()) {
831  return [this, hoisted_quals, co](llvm::BasicBlock* true_bb,
832  llvm::BasicBlock* exit_bb,
833  const std::string& loop_name,
834  llvm::Function* parent_func,
835  CgenState* cgen_state) -> llvm::BasicBlock* {
836  // make sure we have quals to hoist
837  bool has_quals_to_hoist = false;
838  for (const auto& qual : hoisted_quals) {
839  // check to see if the filter was previously hoisted. if all filters were
840  // previously hoisted, this callback becomes a noop
841  if (plan_state_->hoisted_filters_.count(qual) == 0) {
842  has_quals_to_hoist = true;
843  break;
844  }
845  }
846 
847  if (!has_quals_to_hoist) {
848  return nullptr;
849  }
850 
851  AUTOMATIC_IR_METADATA(cgen_state);
852 
853  llvm::IRBuilder<>& builder = cgen_state->ir_builder_;
854  auto& context = builder.getContext();
855 
856  const auto filter_bb =
857  llvm::BasicBlock::Create(context,
858  "hoisted_left_join_filters_" + loop_name,
859  parent_func,
860  /*insert_before=*/true_bb);
861  builder.SetInsertPoint(filter_bb);
862 
863  llvm::Value* filter_lv = cgen_state_->llBool(true);
864  CodeGenerator code_generator(this);
866  for (const auto& qual : hoisted_quals) {
867  if (plan_state_->hoisted_filters_.insert(qual).second) {
868  // qual was inserted into the hoisted filters map, which means we have not
869  // seen this qual before. Generate filter.
870  VLOG(1) << "Generating code for hoisted left hand side qualifier "
871  << qual->toString();
872  auto cond = code_generator.toBool(
873  code_generator.codegen(qual.get(), true, co).front());
874  filter_lv = builder.CreateAnd(filter_lv, cond);
875  }
876  }
877  CHECK(filter_lv->getType()->isIntegerTy(1));
878 
879  builder.CreateCondBr(filter_lv, true_bb, exit_bb);
880  return filter_bb;
881  };
882  }
883  }
884  }
885  }
886  return nullptr;
887 }
bool g_enable_left_join_filter_hoisting
Definition: Execute.cpp:96
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
#define AUTOMATIC_IR_METADATA(CGENSTATE)
std::list< std::shared_ptr< Analyzer::Expr > > quals
#define CHECK(condition)
Definition: Logger.h:209
#define VLOG(n)
Definition: Logger.h:303
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals


std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> Executor::buildIsDeletedCb ( const RelAlgExecutionUnit &  ra_exe_unit,
const size_t  level_idx,
const CompilationOptions &  co 
)
private

Definition at line 890 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_LT, CodeGenerator::codegen(), CompilationOptions::filter_on_deleted_column, RelAlgExecutionUnit::input_descs, TABLE, and CodeGenerator::toBool().

892  {
893  AUTOMATIC_IR_METADATA(cgen_state_.get());
894  if (!co.filter_on_deleted_column) {
895  return nullptr;
896  }
897  CHECK_LT(level_idx + 1, ra_exe_unit.input_descs.size());
898  const auto input_desc = ra_exe_unit.input_descs[level_idx + 1];
899  if (input_desc.getSourceType() != InputSourceType::TABLE) {
900  return nullptr;
901  }
902 
903  const auto deleted_cd = plan_state_->getDeletedColForTable(input_desc.getTableId());
904  if (!deleted_cd) {
905  return nullptr;
906  }
907  CHECK(deleted_cd->columnType.is_boolean());
908  const auto deleted_expr = makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
909  input_desc.getTableId(),
910  deleted_cd->columnId,
911  input_desc.getNestLevel());
912  return [this, deleted_expr, level_idx, &co](const std::vector<llvm::Value*>& prev_iters,
913  llvm::Value* have_more_inner_rows) {
914  const auto matching_row_index = addJoinLoopIterator(prev_iters, level_idx + 1);
915  // Avoid fetching the deleted column from a position which is not valid.
916  // An invalid position can be returned by a one to one hash lookup (negative)
917  // or at the end of iteration over a set of matching values.
918  llvm::Value* is_valid_it{nullptr};
919  if (have_more_inner_rows) {
920  is_valid_it = have_more_inner_rows;
921  } else {
922  is_valid_it = cgen_state_->ir_builder_.CreateICmp(
923  llvm::ICmpInst::ICMP_SGE, matching_row_index, cgen_state_->llInt<int64_t>(0));
924  }
925  const auto it_valid_bb = llvm::BasicBlock::Create(
926  cgen_state_->context_, "it_valid", cgen_state_->current_func_);
927  const auto it_not_valid_bb = llvm::BasicBlock::Create(
928  cgen_state_->context_, "it_not_valid", cgen_state_->current_func_);
929  cgen_state_->ir_builder_.CreateCondBr(is_valid_it, it_valid_bb, it_not_valid_bb);
930  const auto row_is_deleted_bb = llvm::BasicBlock::Create(
931  cgen_state_->context_, "row_is_deleted", cgen_state_->current_func_);
932  cgen_state_->ir_builder_.SetInsertPoint(it_valid_bb);
933  CodeGenerator code_generator(this);
934  const auto row_is_deleted = code_generator.toBool(
935  code_generator.codegen(deleted_expr.get(), true, co).front());
936  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
937  cgen_state_->ir_builder_.SetInsertPoint(it_not_valid_bb);
938  const auto row_is_deleted_default = cgen_state_->llBool(false);
939  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
940  cgen_state_->ir_builder_.SetInsertPoint(row_is_deleted_bb);
941  auto row_is_deleted_or_default =
942  cgen_state_->ir_builder_.CreatePHI(row_is_deleted->getType(), 2);
943  row_is_deleted_or_default->addIncoming(row_is_deleted, it_valid_bb);
944  row_is_deleted_or_default->addIncoming(row_is_deleted_default, it_not_valid_bb);
945  return row_is_deleted_or_default;
946  };
947 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:219
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:1108
#define CHECK(condition)
Definition: Logger.h:209


std::vector< JoinLoop > Executor::buildJoinLoops ( RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache 
)
private

Definition at line 545 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), INJECT_TIMER, CgenState::ir_builder_, RelAlgExecutionUnit::join_quals, LEFT, PlanState::left_join_non_hashtable_quals_, CgenState::llBool(), MultiSet, OneToOne, CgenState::outer_join_match_found_per_level_, CodeGenerator::plan_state_, Set, Singleton, JoinLoopDomain::slot_lookup_result, CodeGenerator::toBool(), and JoinLoopDomain::values_buffer.

550  {
551  AUTOMATIC_IR_METADATA(cgen_state_.get());
552  INJECT_TIMER(buildJoinLoops);
553  std::vector<JoinLoop> join_loops;
554  for (size_t level_idx = 0, current_hash_table_idx = 0;
555  level_idx < ra_exe_unit.join_quals.size();
556  ++level_idx) {
557  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
558  std::vector<std::string> fail_reasons;
559  const auto current_level_hash_table =
560  buildCurrentLevelHashTable(current_level_join_conditions,
561  level_idx,
562  ra_exe_unit,
563  co,
564  query_infos,
565  column_cache,
566  fail_reasons);
567  const auto found_outer_join_matches_cb =
568  [this, level_idx](llvm::Value* found_outer_join_matches) {
569  CHECK_LT(level_idx, cgen_state_->outer_join_match_found_per_level_.size());
570  CHECK(!cgen_state_->outer_join_match_found_per_level_[level_idx]);
571  cgen_state_->outer_join_match_found_per_level_[level_idx] =
572  found_outer_join_matches;
573  };
574  const auto is_deleted_cb = buildIsDeletedCb(ra_exe_unit, level_idx, co);
575  auto rem_left_join_quals_it =
576  plan_state_->left_join_non_hashtable_quals_.find(level_idx);
577  bool has_remaining_left_join_quals =
578  rem_left_join_quals_it != plan_state_->left_join_non_hashtable_quals_.end() &&
579  !rem_left_join_quals_it->second.empty();
580  const auto outer_join_condition_remaining_quals_cb =
581  [this, level_idx, &co](const std::vector<llvm::Value*>& prev_iters) {
582  // when we have multiple quals for the left join in the current join level
583  // we first try to build a hashtable by using one of the possible qual,
584  // and deal with remaining quals as extra join conditions
585  FetchCacheAnchor anchor(cgen_state_.get());
586  addJoinLoopIterator(prev_iters, level_idx + 1);
587  llvm::Value* left_join_cond = cgen_state_->llBool(true);
588  CodeGenerator code_generator(this);
589  auto it = plan_state_->left_join_non_hashtable_quals_.find(level_idx);
590  if (it != plan_state_->left_join_non_hashtable_quals_.end()) {
591  for (auto expr : it->second) {
592  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
593  left_join_cond,
594  code_generator.toBool(
595  code_generator.codegen(expr.get(), true, co).front()));
596  }
597  }
598  return left_join_cond;
599  };
600  if (current_level_hash_table) {
601  const auto hoisted_filters_cb = buildHoistLeftHandSideFiltersCb(
602  ra_exe_unit, level_idx, current_level_hash_table->getInnerTableId(), co);
603  if (current_level_hash_table->getHashType() == HashType::OneToOne) {
604  join_loops.emplace_back(
605  /*kind=*/JoinLoopKind::Singleton,
606  /*type=*/current_level_join_conditions.type,
607  /*iteration_domain_codegen=*/
608  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
609  const std::vector<llvm::Value*>& prev_iters) {
610  addJoinLoopIterator(prev_iters, level_idx);
611  JoinLoopDomain domain{{0}};
612  domain.slot_lookup_result =
613  current_level_hash_table->codegenSlot(co, current_hash_table_idx);
614  return domain;
615  },
616  /*outer_condition_match=*/
617  current_level_join_conditions.type == JoinType::LEFT &&
618  has_remaining_left_join_quals
619  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
620  outer_join_condition_remaining_quals_cb)
621  : nullptr,
622  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
623  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
624  : nullptr,
625  /*hoisted_filters=*/hoisted_filters_cb,
626  /*is_deleted=*/is_deleted_cb);
627  } else if (auto range_join_table =
628  dynamic_cast<RangeJoinHashTable*>(current_level_hash_table.get())) {
629  join_loops.emplace_back(
630  /* kind= */ JoinLoopKind::MultiSet,
631  /* type= */ current_level_join_conditions.type,
632  /* iteration_domain_codegen= */
633  [this,
634  range_join_table,
635  current_hash_table_idx,
636  level_idx,
637  current_level_hash_table,
638  &co](const std::vector<llvm::Value*>& prev_iters) {
639  addJoinLoopIterator(prev_iters, level_idx);
640  JoinLoopDomain domain{{0}};
641  CHECK(!prev_iters.empty());
642  const auto matching_set = range_join_table->codegenMatchingSetWithOffset(
643  co, current_hash_table_idx, prev_iters.back());
644  domain.values_buffer = matching_set.elements;
645  domain.element_count = matching_set.count;
646  return domain;
647  },
648  /* outer_condition_match= */
649  current_level_join_conditions.type == JoinType::LEFT
650  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
651  outer_join_condition_remaining_quals_cb)
652  : nullptr,
653  /* found_outer_matches= */
654  current_level_join_conditions.type == JoinType::LEFT
655  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
656  : nullptr,
657  /* hoisted_filters= */ nullptr, // <<! TODO
658  /* is_deleted= */ is_deleted_cb);
659  } else {
660  join_loops.emplace_back(
661  /*kind=*/JoinLoopKind::Set,
662  /*type=*/current_level_join_conditions.type,
663  /*iteration_domain_codegen=*/
664  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
665  const std::vector<llvm::Value*>& prev_iters) {
666  addJoinLoopIterator(prev_iters, level_idx);
667  JoinLoopDomain domain{{0}};
668  const auto matching_set = current_level_hash_table->codegenMatchingSet(
669  co, current_hash_table_idx);
670  domain.values_buffer = matching_set.elements;
671  domain.element_count = matching_set.count;
672  return domain;
673  },
674  /*outer_condition_match=*/
675  current_level_join_conditions.type == JoinType::LEFT
676  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
677  outer_join_condition_remaining_quals_cb)
678  : nullptr,
679  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
680  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
681  : nullptr,
682  /*hoisted_filters=*/hoisted_filters_cb,
683  /*is_deleted=*/is_deleted_cb);
684  }
685  ++current_hash_table_idx;
686  } else {
687  const auto fail_reasons_str = current_level_join_conditions.quals.empty()
688  ? "No equijoin expression found"
689  : boost::algorithm::join(fail_reasons, " | ");
690  check_if_loop_join_is_allowed(
691  ra_exe_unit, eo, query_infos, level_idx, fail_reasons_str);
692  // Callback provided to the `JoinLoop` framework to evaluate the (outer) join
693  // condition.
694  VLOG(1) << "Unable to build hash table, falling back to loop join: "
695  << fail_reasons_str;
696  const auto outer_join_condition_cb =
697  [this, level_idx, &co, &current_level_join_conditions](
698  const std::vector<llvm::Value*>& prev_iters) {
699  // The values generated for the match path don't dominate all uses
700  // since on the non-match path nulls are generated. Reset the cache
701  // once the condition is generated to avoid incorrect reuse.
702  FetchCacheAnchor anchor(cgen_state_.get());
703  addJoinLoopIterator(prev_iters, level_idx + 1);
704  llvm::Value* left_join_cond = cgen_state_->llBool(true);
705  CodeGenerator code_generator(this);
706  for (auto expr : current_level_join_conditions.quals) {
707  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
708  left_join_cond,
709  code_generator.toBool(
710  code_generator.codegen(expr.get(), true, co).front()));
711  }
712  return left_join_cond;
713  };
714  join_loops.emplace_back(
715  /*kind=*/JoinLoopKind::UpperBound,
716  /*type=*/current_level_join_conditions.type,
717  /*iteration_domain_codegen=*/
718  [this, level_idx](const std::vector<llvm::Value*>& prev_iters) {
719  addJoinLoopIterator(prev_iters, level_idx);
720  JoinLoopDomain domain{{0}};
721  const auto rows_per_scan_ptr = cgen_state_->ir_builder_.CreateGEP(
722  get_arg_by_name(cgen_state_->row_func_, "num_rows_per_scan"),
723  cgen_state_->llInt(int32_t(level_idx + 1)));
724  domain.upper_bound = cgen_state_->ir_builder_.CreateLoad(rows_per_scan_ptr,
725  "num_rows_per_scan");
726  return domain;
727  },
728  /*outer_condition_match=*/
729  current_level_join_conditions.type == JoinType::LEFT
730  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
731  outer_join_condition_cb)
732  : nullptr,
733  /*found_outer_matches=*/
734  current_level_join_conditions.type == JoinType::LEFT
735  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
736  : nullptr,
737  /*hoisted_filters=*/nullptr,
738  /*is_deleted=*/is_deleted_cb);
739  }
740  }
741  return join_loops;
742 }
llvm::Value * values_buffer
Definition: JoinLoop.h:49
std::string join(T const &container, std::string const &delim)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:164
#define INJECT_TIMER(DESC)
Definition: measure.h:93
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * slot_lookup_result
Definition: JoinLoop.h:47
#define CHECK_LT(x, y)
Definition: Logger.h:219
std::shared_ptr< HashJoin > buildCurrentLevelHashTable(const JoinCondition &current_level_join_conditions, size_t level_idx, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
Definition: IRCodegen.cpp:949
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:1108
#define CHECK(condition)
Definition: Logger.h:209
void check_if_loop_join_is_allowed(RelAlgExecutionUnit &ra_exe_unit, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, const size_t level_idx, const std::string &fail_reason)
Definition: IRCodegen.cpp:506
JoinLoop::HoistedFiltersCallback buildHoistLeftHandSideFiltersCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const int inner_table_id, const CompilationOptions &co)
Definition: IRCodegen.cpp:782
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:545
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
Definition: IRCodegen.cpp:890
#define VLOG(n)
Definition: Logger.h:303

+ Here is the call graph for this function:
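
Per nesting level the function picks one of four loop kinds, depending on whether a hash table could be built and, if so, on its layout. A hedged restatement of that dispatch (hypothetical helper, not part of the Executor API):

enum class LoopKind { Singleton, Set, MultiSet, UpperBound };

// Summarizes which JoinLoopKind the code above selects for one join level.
LoopKind pick_loop_kind(bool built_hash_table, bool one_to_one, bool range_join) {
  if (!built_hash_table) {
    return LoopKind::UpperBound;  // loop join over all inner rows per scan
  }
  if (one_to_one) {
    return LoopKind::Singleton;   // OneToOne layout: at most one matching row
  }
  return range_join ? LoopKind::MultiSet : LoopKind::Set;
}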

void Executor::buildSelectedFragsMapping ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList selected_fragments,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 2851 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LT, and RelAlgExecutionUnit::input_descs.

2856  {
2857  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2858  size_t frag_pos{0};
2859  const auto& input_descs = ra_exe_unit.input_descs;
2860  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2861  const int table_id = input_descs[scan_idx].getTableId();
2862  CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2863  selected_fragments_crossjoin.push_back(
2864  getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2865  for (const auto& col_id : col_global_ids) {
2866  CHECK(col_id);
2867  const auto& input_desc = col_id->getScanDesc();
2868  if (input_desc.getTableId() != table_id ||
2869  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2870  continue;
2871  }
2872  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2873  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2874  CHECK_LT(static_cast<size_t>(it->second),
2875  plan_state_->global_to_local_col_ids_.size());
2876  local_col_to_frag_pos[it->second] = frag_pos;
2877  }
2878  ++frag_pos;
2879  }
2880 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
#define CHECK_LT(x, y)
Definition: Logger.h:219
std::vector< size_t > getFragmentCount(const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:2837
#define CHECK(condition)
Definition: Logger.h:209
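
A toy model of the mapping this function builds (plain STL, hypothetical table ids): every fetched column is assigned the fragment position of its scan, so the kernel can locate the right fragment inside the cross-join of selected fragments:

#include <iostream>
#include <utility>
#include <vector>

int main() {
  // Hypothetical query shape: two scans (tables 10 and 20), three fetched
  // columns identified by (table_id, local_col_id).
  std::vector<std::pair<int, int>> cols = {{10, 0}, {10, 1}, {20, 2}};
  std::vector<int> scan_table_ids = {10, 20};  // input_descs order
  std::vector<int> local_col_to_frag_pos(cols.size());
  for (size_t scan_idx = 0; scan_idx < scan_table_ids.size(); ++scan_idx) {
    for (const auto& [table_id, local_col_id] : cols) {
      if (table_id == scan_table_ids[scan_idx]) {
        // Each scan contributes one coordinate of the fragment cross-join.
        local_col_to_frag_pos[local_col_id] = static_cast<int>(scan_idx);
      }
    }
  }
  for (int pos : local_col_to_frag_pos) {
    std::cout << pos << ' ';  // prints: 0 0 1
  }
  std::cout << '\n';
}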
void Executor::buildSelectedFragsMappingForUnion ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList selected_fragments,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 2882 of file Execute.cpp.

References CHECK, CHECK_LT, and RelAlgExecutionUnit::input_descs.

2887  {
2888  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2889  size_t frag_pos{0};
2890  const auto& input_descs = ra_exe_unit.input_descs;
2891  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2892  const int table_id = input_descs[scan_idx].getTableId();
2893  // selected_fragments here is from assignFragsToKernelDispatch
2894  // execution_kernel.fragments
2895  if (selected_fragments[0].table_id != table_id) { // TODO 0
2896  continue;
2897  }
2898  // CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2899  selected_fragments_crossjoin.push_back(
2900  // getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2901  {size_t(1)}); // TODO
2902  for (const auto& col_id : col_global_ids) {
2903  CHECK(col_id);
2904  const auto& input_desc = col_id->getScanDesc();
2905  if (input_desc.getTableId() != table_id ||
2906  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2907  continue;
2908  }
2909  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2910  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2911  CHECK_LT(static_cast<size_t>(it->second),
2912  plan_state_->global_to_local_col_ids_.size());
2913  local_col_to_frag_pos[it->second] = frag_pos;
2914  }
2915  ++frag_pos;
2916  }
2917 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
#define CHECK_LT(x, y)
Definition: Logger.h:219
#define CHECK(condition)
Definition: Logger.h:209
llvm::Value * Executor::castToFP ( llvm::Value *  value,
SQLTypeInfo const from_ti,
SQLTypeInfo const to_ti 
)
private

Definition at line 3473 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, exp_to_scale(), logger::FATAL, SQLTypeInfo::get_scale(), SQLTypeInfo::get_size(), SQLTypeInfo::is_fp(), SQLTypeInfo::is_number(), and LOG.

3475  {
3476  AUTOMATIC_IR_METADATA(cgen_state_.get());
3477  if (value->getType()->isIntegerTy() && from_ti.is_number() && to_ti.is_fp() &&
3478  (!from_ti.is_fp() || from_ti.get_size() != to_ti.get_size())) {
3479  llvm::Type* fp_type{nullptr};
3480  switch (to_ti.get_size()) {
3481  case 4:
3482  fp_type = llvm::Type::getFloatTy(cgen_state_->context_);
3483  break;
3484  case 8:
3485  fp_type = llvm::Type::getDoubleTy(cgen_state_->context_);
3486  break;
3487  default:
3488  LOG(FATAL) << "Unsupported FP size: " << to_ti.get_size();
3489  }
3490  value = cgen_state_->ir_builder_.CreateSIToFP(value, fp_type);
3491  if (from_ti.get_scale()) {
3492  value = cgen_state_->ir_builder_.CreateFDiv(
3493  value,
3494  llvm::ConstantFP::get(value->getType(), exp_to_scale(from_ti.get_scale())));
3495  }
3496  }
3497  return value;
3498 }
#define LOG(tag)
Definition: Logger.h:203
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
#define AUTOMATIC_IR_METADATA(CGENSTATE)
uint64_t exp_to_scale(const unsigned exp)

+ Here is the call graph for this function:
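
For scaled decimals the emitted IR is an SIToFP followed by an FDiv by exp_to_scale(scale). Host-side arithmetic showing the effect for a hypothetical DECIMAL(10,2) value:

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical DECIMAL(10,2) payload: the integer 12345 encodes 123.45.
  int64_t stored = 12345;
  double fp = static_cast<double>(stored);  // CreateSIToFP
  fp /= 100.0;                              // CreateFDiv by exp_to_scale(2)
  std::cout << fp << '\n';                  // 123.45
}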

llvm::Value * Executor::castToIntPtrTyIn ( llvm::Value *  val,
const size_t  bit_width 
)
private

Definition at line 3500 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_LT, and get_int_type().

3500  {
3501  AUTOMATIC_IR_METADATA(cgen_state_.get());
3502  CHECK(val->getType()->isPointerTy());
3503 
3504  const auto val_ptr_type = static_cast<llvm::PointerType*>(val->getType());
3505  const auto val_type = val_ptr_type->getElementType();
3506  size_t val_width = 0;
3507  if (val_type->isIntegerTy()) {
3508  val_width = val_type->getIntegerBitWidth();
3509  } else {
3510  if (val_type->isFloatTy()) {
3511  val_width = 32;
3512  } else {
3513  CHECK(val_type->isDoubleTy());
3514  val_width = 64;
3515  }
3516  }
3517  CHECK_LT(size_t(0), val_width);
3518  if (bitWidth == val_width) {
3519  return val;
3520  }
3521  return cgen_state_->ir_builder_.CreateBitCast(
3522  val, llvm::PointerType::get(get_int_type(bitWidth, cgen_state_->context_), 0));
3523 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:219
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:
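
The bitcast only reinterprets the pointee width; no data is converted. A well-defined host-side analogue using memcpy (illustrative, not Executor code):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  double slot = 1.0;  // a 64-bit slot, as a double (val_width == 64)
  int64_t bits = 0;
  std::memcpy(&bits, &slot, sizeof(bits));  // reinterpret the same 64 bits
  std::printf("%016llx\n", static_cast<unsigned long long>(bits));
  // prints 3ff0000000000000, the IEEE-754 encoding of 1.0
}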

bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3951 of file Execute.cpp.

3952  {
3953  // if current_query_session is equal to the candidate_query_session,
3954  // or it is empty session we consider
3955  return !candidate_query_session.empty() &&
3956  (current_query_session_ == candidate_query_session);
3957 }
QuerySessionId current_query_session_
Definition: Execute.h:1121
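
A toy restatement of the predicate (the read_lock parameter only documents that the caller must already hold the session lock; it is not consulted):

#include <string>

// An empty candidate never matches; a non-empty one matches only the
// executor's current session.
bool check_current_session(const std::string& current, const std::string& candidate) {
  return !candidate.empty() && current == candidate;
}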
bool Executor::checkIsQuerySessionEnrolled ( const QuerySessionId query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 4218 of file Execute.cpp.

4220  {
4221  if (query_session.empty()) {
4222  return false;
4223  }
4224  return !query_session.empty() && queries_session_map_.count(query_session);
4225 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1125
bool Executor::checkIsQuerySessionInterrupted ( const std::string &  query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 4207 of file Execute.cpp.

4209  {
4210  if (query_session.empty()) {
4211  return false;
4212  }
4213  auto flag_it = queries_interrupt_flag_.find(query_session);
4214  return !query_session.empty() && flag_it != queries_interrupt_flag_.end() &&
4215  flag_it->second;
4216 }
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1123
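
Both session checks reduce to the same map-lookup shape; a self-contained sketch (std::unordered_map standing in for the real InterruptFlagMap):

#include <string>
#include <unordered_map>

using InterruptFlagMap = std::unordered_map<std::string, bool>;  // stand-in

// A session counts as interrupted only if it is non-empty, present in the
// map, and its flag is set; absent or false flags mean "keep running".
bool is_interrupted(const InterruptFlagMap& flags, const std::string& session) {
  if (session.empty()) {
    return false;
  }
  auto it = flags.find(session);
  return it != flags.end() && it->second;
}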
bool Executor::checkNonKernelTimeInterrupted ( ) const

Definition at line 4295 of file Execute.cpp.

References executor_id_.

4295  {
4296  // this function should be called within an executor which is assigned
4297  // to the specific query thread (that indicates we already enroll the session)
4298  // check whether this is called from non unitary executor
4299  if (executor_id_ == Executor::UNITARY_EXECUTOR_ID) {
4300  return false;
4301  };
4302  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
4303  auto flag_it = queries_interrupt_flag_.find(current_query_session_);
4304  return !current_query_session_.empty() && flag_it != queries_interrupt_flag_.end() &&
4305  flag_it->second;
4306 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1119
QuerySessionId current_query_session_
Definition: Execute.h:1121
const ExecutorId executor_id_
Definition: Execute.h:1101
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1123
static const ExecutorId UNITARY_EXECUTOR_ID
Definition: Execute.h:384

+ Here is the call graph for this function:

void Executor::checkPendingQueryStatus ( const QuerySessionId query_session)

Definition at line 3994 of file Execute.cpp.

References ERR_INTERRUPTED, and VLOG.

3994  {
3995  // check whether we are okay to execute the "pending" query
3996  // i.e., before running the query check if this query session is "ALREADY" interrupted
3997  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
3998  if (query_session.empty()) {
3999  return;
4000  }
4001  if (queries_interrupt_flag_.find(query_session) == queries_interrupt_flag_.end()) {
4002  // something goes wrong since we assume this is caller's responsibility
4003  // (call this function only for enrolled query session)
4004  if (!queries_session_map_.count(query_session)) {
4005  VLOG(1) << "Interrupting pending query is not available since the query session is "
4006  "not enrolled";
4007  } else {
4008  // here the query session is enrolled but the interrupt flag is not registered
4009  VLOG(1)
4010  << "Interrupting pending query is not available since its interrupt flag is "
4011  "not registered";
4012  }
4013  return;
4014  }
4015  if (queries_interrupt_flag_[query_session]) {
4016  throw QueryExecutionError(ERR_INTERRUPTED);
4017  }
4018 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1119
static QuerySessionMap queries_session_map_
Definition: Execute.h:1125
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1163
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1123
#define VLOG(n)
Definition: Logger.h:303
void Executor::clearMemory ( const Data_Namespace::MemoryLevel  memory_level)
static

Definition at line 201 of file Execute.cpp.

References CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCaches().

Referenced by DBHandler::clear_cpu_memory(), DBHandler::clear_gpu_memory(), QueryRunner::QueryRunner::clearCpuMemory(), and QueryRunner::QueryRunner::clearGpuMemory().

201  {
202  switch (memory_level) {
203  case Data_Namespace::MemoryLevel::CPU_LEVEL:
204  case Data_Namespace::MemoryLevel::GPU_LEVEL: {
205  mapd_unique_lock<mapd_shared_mutex> flush_lock(
206  execute_mutex_); // Don't flush memory while queries are running
207 
208  Catalog_Namespace::SysCatalog::instance().getDataMgr().clearMemory(memory_level);
209  if (memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
210  // The hash table cache uses CPU memory not managed by the buffer manager. In the
211  // future, we should manage these allocations with the buffer manager directly.
212  // For now, assume the user wants to purge the hash table cache when they clear
213  // CPU memory (currently used in ExecuteTest to lower memory pressure)
214  JoinHashTableCacheInvalidator::invalidateCaches();
215  }
216  break;
217  }
218  default: {
219  throw std::runtime_error(
220  "Clearing memory levels other than the CPU level or GPU level is not "
221  "supported.");
222  }
223  }
224 }
static mapd_shared_mutex execute_mutex_
Definition: Execute.h:1130
void clearMemory(const MemoryLevel memLevel)
Definition: DataMgr.cpp:431
static void invalidateCaches()
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:217
static SysCatalog & instance()
Definition: SysCatalog.h:325

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
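
Hypothetical call sites, mirroring the callers listed above (assumes the OmniSciDB headers and an initialized SysCatalog/DataMgr; the call blocks until no query holds execute_mutex_):

// e.g. from a server-side memory-management handler:
Executor::clearMemory(Data_Namespace::MemoryLevel::CPU_LEVEL);  // also purges the hash table cache
Executor::clearMemory(Data_Namespace::MemoryLevel::GPU_LEVEL);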

void Executor::clearMetaInfoCache ( )
private

Definition at line 405 of file Execute.cpp.

References input_table_info_cache_.

405  {
406  input_table_info_cache_.clear();
407  agg_col_range_cache_.clear();
408  table_generations_.clear();
409 }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1117
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:1116
TableGenerations table_generations_
Definition: Execute.h:1118

+ Here is the call graph for this function:

void Executor::clearQuerySessionStatus ( const QuerySessionId query_session,
const std::string &  submitted_time_str 
)

Definition at line 4020 of file Execute.cpp.

4021  {
4022  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
4023  // clear the interrupt-related info for a finished query
4024  if (query_session.empty()) {
4025  return;
4026  }
4027  removeFromQuerySessionList(query_session, submitted_time_str, session_write_lock);
4028  if (query_session.compare(current_query_session_) == 0) {
4029  invalidateRunningQuerySession(session_write_lock);
4030  resetInterrupt();
4031  }
4032 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1119
void invalidateRunningQuerySession(mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:3972
QuerySessionId current_query_session_
Definition: Execute.h:1121
void resetInterrupt()
bool removeFromQuerySessionList(const QuerySessionId &query_session, const std::string &submitted_time_str, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:4162
llvm::Value * Executor::codegenAggregateWindowState ( )
private

Definition at line 326 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), Analyzer::WindowFunction::getKind(), kDECIMAL, kDOUBLE, and kFLOAT.

326  {
327  AUTOMATIC_IR_METADATA(cgen_state_.get());
328  const auto pi32_type =
329  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
330  const auto pi64_type =
331  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
332  const auto window_func_context =
333  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
334  const Analyzer::WindowFunction* window_func = window_func_context->getWindowFunction();
335  const auto window_func_ti = get_adjusted_window_type_info(window_func);
336  const auto aggregate_state_type =
337  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
338  auto aggregate_state = aggregateWindowStatePtr();
339  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
340  const auto aggregate_state_count_i64 = cgen_state_->llInt(
341  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
342  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
343  aggregate_state_count_i64, aggregate_state_type);
344  const auto double_null_lv = cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE));
345  switch (window_func_ti.get_type()) {
346  case kFLOAT: {
347  return cgen_state_->emitCall(
348  "load_avg_float", {aggregate_state, aggregate_state_count, double_null_lv});
349  }
350  case kDOUBLE: {
351  return cgen_state_->emitCall(
352  "load_avg_double", {aggregate_state, aggregate_state_count, double_null_lv});
353  }
354  case kDECIMAL: {
355  return cgen_state_->emitCall(
356  "load_avg_decimal",
357  {aggregate_state,
358  aggregate_state_count,
359  double_null_lv,
360  cgen_state_->llInt<int32_t>(window_func_ti.get_scale())});
361  }
362  default: {
363  return cgen_state_->emitCall(
364  "load_avg_int", {aggregate_state, aggregate_state_count, double_null_lv});
365  }
366  }
367  }
368  if (window_func->getKind() == SqlWindowFunctionKind::COUNT) {
369  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
370  }
371  switch (window_func_ti.get_type()) {
372  case kFLOAT: {
373  return cgen_state_->emitCall("load_float", {aggregate_state});
374  }
375  case kDOUBLE: {
376  return cgen_state_->emitCall("load_double", {aggregate_state});
377  }
378  default: {
379  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
380  }
381  }
382 }
SqlWindowFunctionKind getKind() const
Definition: Analyzer.h:1607
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:
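
For AVG the emitted call divides the accumulated sum by the accumulated count, yielding the null sentinel when the count is zero. A host-side restatement with hypothetical values (the real load_avg_* helpers live in the runtime library):

#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  // Hypothetical aggregate state after iterating a partition.
  int64_t sum = 700;
  int64_t count = 4;
  const double null_sentinel = std::numeric_limits<double>::lowest();  // stand-in
  double avg = count != 0 ? static_cast<double>(sum) / count : null_sentinel;
  std::cout << avg << '\n';  // 175
}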

void Executor::codegenJoinLoops ( const std::vector< JoinLoop > &  join_loops,
const RelAlgExecutionUnit ra_exe_unit,
GroupByAndAggregate group_by_and_aggregate,
llvm::Function *  query_func,
llvm::BasicBlock *  entry_bb,
const QueryMemoryDescriptor query_mem_desc,
const CompilationOptions co,
const ExecutionOptions eo 
)
private

Definition at line 1126 of file IRCodegen.cpp.

References ExecutionOptions::allow_runtime_query_interrupt, AUTOMATIC_IR_METADATA, CHECK, CHECK_EQ, JoinLoop::codegen(), CompilationOptions::device_type, JoinLoopDomain::element_count, get_int_array_type(), get_int_type(), INNER, MultiSet, CodeGenerator::posArg(), GroupByAndAggregate::query_infos_, query_mem_desc, Set, and ExecutionOptions::with_dynamic_watchdog.

1133  {
1134  AUTOMATIC_IR_METADATA(cgen_state_.get());
1135  const auto exit_bb =
1136  llvm::BasicBlock::Create(cgen_state_->context_, "exit", cgen_state_->current_func_);
1137  cgen_state_->ir_builder_.SetInsertPoint(exit_bb);
1138  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
1139  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
1140  CodeGenerator code_generator(this);
1141 
1142  llvm::BasicBlock* loops_entry_bb{nullptr};
1143  auto has_range_join =
1144  std::any_of(join_loops.begin(), join_loops.end(), [](const auto& join_loop) {
1145  return join_loop.kind() == JoinLoopKind::MultiSet;
1146  });
1147  if (has_range_join) {
1148  CHECK_EQ(join_loops.size(), size_t(1));
1149  const auto element_count =
1150  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_), 9);
1151 
1152  auto compute_packed_offset = [](const int32_t x, const int32_t y) -> uint64_t {
1153  const uint64_t y_shifted = static_cast<uint64_t>(y) << 32;
1154  return y_shifted | static_cast<uint32_t>(x);
1155  };
1156 
1157  const auto values_arr = std::vector<llvm::Constant*>{
1158  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_), 0),
1159  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1160  compute_packed_offset(0, 1)),
1161  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1162  compute_packed_offset(0, -1)),
1163  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1164  compute_packed_offset(1, 0)),
1165  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1166  compute_packed_offset(1, 1)),
1167  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1168  compute_packed_offset(1, -1)),
1169  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1170  compute_packed_offset(-1, 0)),
1171  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1172  compute_packed_offset(-1, 1)),
1173  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1174  compute_packed_offset(-1, -1))};
1175 
1176  const auto constant_values_array = llvm::ConstantArray::get(
1177  get_int_array_type(64, 9, cgen_state_->context_), values_arr);
1178  CHECK(cgen_state_->module_);
1179  const auto values =
1180  new llvm::GlobalVariable(*cgen_state_->module_,
1181  get_int_array_type(64, 9, cgen_state_->context_),
1182  true,
1183  llvm::GlobalValue::LinkageTypes::InternalLinkage,
1184  constant_values_array);
1185  JoinLoop join_loop(
1186  JoinLoopKind::Set,
1187  JoinType::INNER,
1188  [element_count, values](const std::vector<llvm::Value*>& v) {
1189  JoinLoopDomain domain{{0}};
1190  domain.element_count = element_count;
1191  domain.values_buffer = values;
1192  return domain;
1193  },
1194  nullptr,
1195  nullptr,
1196  nullptr,
1197  nullptr,
1198  "range_key_loop");
1199 
1200  loops_entry_bb = JoinLoop::codegen(
1201  {join_loop},
1202  [this,
1203  query_func,
1204  &query_mem_desc,
1205  &co,
1206  &eo,
1207  &group_by_and_aggregate,
1208  &join_loops,
1209  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
1210  auto& builder = cgen_state_->ir_builder_;
1211 
1212  auto body_exit_bb =
1213  llvm::BasicBlock::Create(cgen_state_->context_,
1214  "range_key_inner_body_exit",
1215  builder.GetInsertBlock()->getParent());
1216 
1217  auto range_key_body_bb =
1218  llvm::BasicBlock::Create(cgen_state_->context_,
1219  "range_key_loop_body",
1220  builder.GetInsertBlock()->getParent());
1221  builder.SetInsertPoint(range_key_body_bb);
1222 
1223  const auto body_loops_entry_bb = JoinLoop::codegen(
1224  join_loops,
1225  [this,
1226  query_func,
1227  &query_mem_desc,
1228  &co,
1229  &eo,
1230  &group_by_and_aggregate,
1231  &join_loops,
1232  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
1233  addJoinLoopIterator(prev_iters, join_loops.size());
1234  auto& builder = cgen_state_->ir_builder_;
1235  const auto loop_body_bb =
1236  llvm::BasicBlock::Create(builder.getContext(),
1237  "loop_body",
1238  builder.GetInsertBlock()->getParent());
1239  builder.SetInsertPoint(loop_body_bb);
1240  const bool can_return_error =
1241  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
1242  if (can_return_error || cgen_state_->needs_error_check_ ||
1243  eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
1244  createErrorCheckControlFlow(query_func,
1245  eo.with_dynamic_watchdog,
1246  eo.allow_runtime_query_interrupt,
1247  co.device_type,
1248  group_by_and_aggregate.query_infos_);
1249  }
1250  return loop_body_bb;
1251  },
1252  prev_iters.back(),
1253  body_exit_bb,
1254  cgen_state_.get());
1255 
1256  builder.SetInsertPoint(range_key_body_bb);
1257  cgen_state_->ir_builder_.CreateBr(body_loops_entry_bb);
1258 
1259  builder.SetInsertPoint(body_exit_bb);
1260  return range_key_body_bb;
1261  },
1262  code_generator.posArg(nullptr),
1263  exit_bb,
1264  cgen_state_.get());
1265  } else {
1266  loops_entry_bb = JoinLoop::codegen(
1267  join_loops,
1268  /*body_codegen=*/
1269  [this,
1270  query_func,
1271  &query_mem_desc,
1272  &co,
1273  &eo,
1274  &group_by_and_aggregate,
1275  &join_loops,
1276  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
1278  addJoinLoopIterator(prev_iters, join_loops.size());
1279  auto& builder = cgen_state_->ir_builder_;
1280  const auto loop_body_bb = llvm::BasicBlock::Create(
1281  builder.getContext(), "loop_body", builder.GetInsertBlock()->getParent());
1282  builder.SetInsertPoint(loop_body_bb);
1283  const bool can_return_error =
1284  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
1285  if (can_return_error || cgen_state_->needs_error_check_ ||
1286  eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
1287  createErrorCheckControlFlow(query_func,
1288  eo.with_dynamic_watchdog,
1289  eo.allow_runtime_query_interrupt,
1290  co.device_type,
1291  group_by_and_aggregate.query_infos_);
1292  }
1293  return loop_body_bb;
1294  },
1295  /*outer_iter=*/code_generator.posArg(nullptr),
1296  exit_bb,
1297  cgen_state_.get());
1298  }
1299  CHECK(loops_entry_bb);
1300  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
1301  cgen_state_->ir_builder_.CreateBr(loops_entry_bb);
1302 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
llvm::Value * element_count
Definition: JoinLoop.h:46
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
static llvm::BasicBlock * codegen(const std::vector< JoinLoop > &join_loops, const std::function< llvm::BasicBlock *(const std::vector< llvm::Value * > &)> &body_codegen, llvm::Value *outer_iter, llvm::BasicBlock *exit_bb, CgenState *cgen_state)
Definition: JoinLoop.cpp:48
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:1108
#define CHECK(condition)
Definition: Logger.h:209
llvm::ArrayType * get_int_array_type(int const width, int count, llvm::LLVMContext &context)
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)

+ Here is the call graph for this function:
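
The 9-entry global array packs the (x, y) neighborhood offsets for the range-join key loop, one uint64 per pair. A host-side replica of the packing, with two checked values:

#include <cstdint>
#include <cstdio>

// Same packing as the lambda above: y in the high 32 bits, x in the low 32.
uint64_t compute_packed_offset(int32_t x, int32_t y) {
  const uint64_t y_shifted = static_cast<uint64_t>(y) << 32;
  return y_shifted | static_cast<uint32_t>(x);
}

int main() {
  // (0, 1)  -> 0x0000000100000000; (-1, 0) -> 0x00000000ffffffff
  std::printf("%016llx\n", (unsigned long long)compute_packed_offset(0, 1));
  std::printf("%016llx\n", (unsigned long long)compute_packed_offset(-1, 0));
}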

llvm::BasicBlock * Executor::codegenSkipDeletedOuterTableRow ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co 
)
private

Definition at line 3039 of file NativeCodegen.cpp.

3041  {
3042  AUTOMATIC_IR_METADATA(cgen_state_.get());
3043  if (!co.filter_on_deleted_column) {
3044  return nullptr;
3045  }
3046  CHECK(!ra_exe_unit.input_descs.empty());
3047  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
3048  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
3049  return nullptr;
3050  }
3051  const auto deleted_cd =
3052  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
3053  if (!deleted_cd) {
3054  return nullptr;
3055  }
3056  CHECK(deleted_cd->columnType.is_boolean());
3057  const auto deleted_expr =
3058  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
3059  outer_input_desc.getTableId(),
3060  deleted_cd->columnId,
3061  outer_input_desc.getNestLevel());
3062  CodeGenerator code_generator(this);
3063  const auto is_deleted =
3064  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
3065  const auto is_deleted_bb = llvm::BasicBlock::Create(
3066  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
3067  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
3068  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
3069  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
3070  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
3071  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3072  cgen_state_->ir_builder_.SetInsertPoint(bb);
3073  return bb;
3074 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:209
void Executor::codegenWindowAvgEpilogue ( llvm::Value *  crt_val,
llvm::Value *  window_func_null_val,
llvm::Value *  multiplicity_lv 
)
private

Definition at line 289 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

291  {
292  AUTOMATIC_IR_METADATA(cgen_state_.get());
293  const auto window_func_context =
294  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
295  const auto window_func = window_func_context->getWindowFunction();
296  const auto window_func_ti = get_adjusted_window_type_info(window_func);
297  const auto pi32_type =
298  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
299  const auto pi64_type =
300  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
301  const auto aggregate_state_type =
302  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
303  const auto aggregate_state_count_i64 = cgen_state_->llInt(
304  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
305  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
306  aggregate_state_count_i64, aggregate_state_type);
307  std::string agg_count_func_name = "agg_count";
308  switch (window_func_ti.get_type()) {
309  case kFLOAT: {
310  agg_count_func_name += "_float";
311  break;
312  }
313  case kDOUBLE: {
314  agg_count_func_name += "_double";
315  break;
316  }
317  default: {
318  break;
319  }
320  }
321  agg_count_func_name += "_skip_val";
322  cgen_state_->emitCall(agg_count_func_name,
323  {aggregate_state_count, crt_val, window_func_null_val});
324 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:
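
The runtime helper name is assembled from the aggregate and the operand type. A host-side replica of the selection above (illustrative helper, not Executor code):

#include <string>

// kFLOAT -> "agg_count_float_skip_val", kDOUBLE -> "agg_count_double_skip_val",
// anything else -> "agg_count_skip_val".
std::string agg_count_helper_name(bool is_float, bool is_double) {
  std::string name = "agg_count";
  if (is_float) {
    name += "_float";
  } else if (is_double) {
    name += "_double";
  }
  return name + "_skip_val";
}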

llvm::Value * Executor::codegenWindowFunction ( const size_t  target_index,
const CompilationOptions co 
)
private

Definition at line 21 of file WindowFunctionIR.cpp.

References WindowProjectNodeContext::activateWindowFunctionContext(), args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, COUNT, CUME_DIST, DENSE_RANK, logger::FATAL, FIRST_VALUE, WindowProjectNodeContext::get(), WindowFunctionContext::getWindowFunction(), LAG, LAST_VALUE, LEAD, LOG, MAX, MIN, NTILE, PERCENT_RANK, RANK, ROW_NUMBER, and SUM.

22  {
23  AUTOMATIC_IR_METADATA(cgen_state_.get());
24  CodeGenerator code_generator(this);
25  const auto window_func_context =
26  WindowProjectNodeContext::get(this)->activateWindowFunctionContext(this,
27  target_index);
28  const auto window_func = window_func_context->getWindowFunction();
29  switch (window_func->getKind()) {
30  case SqlWindowFunctionKind::ROW_NUMBER:
31  case SqlWindowFunctionKind::RANK:
32  case SqlWindowFunctionKind::DENSE_RANK:
33  case SqlWindowFunctionKind::NTILE: {
34  return cgen_state_->emitCall("row_number_window_func",
35  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
36  window_func_context->output())),
37  code_generator.posArg(nullptr)});
38  }
39  case SqlWindowFunctionKind::PERCENT_RANK:
40  case SqlWindowFunctionKind::CUME_DIST: {
41  return cgen_state_->emitCall("percent_window_func",
42  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
43  window_func_context->output())),
44  code_generator.posArg(nullptr)});
45  }
46  case SqlWindowFunctionKind::LAG:
47  case SqlWindowFunctionKind::LEAD:
48  case SqlWindowFunctionKind::FIRST_VALUE:
49  case SqlWindowFunctionKind::LAST_VALUE: {
50  CHECK(WindowProjectNodeContext::get(this));
51  const auto& args = window_func->getArgs();
52  CHECK(!args.empty());
53  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
54  CHECK_EQ(arg_lvs.size(), size_t(1));
55  return arg_lvs.front();
56  }
57  case SqlWindowFunctionKind::AVG:
58  case SqlWindowFunctionKind::MIN:
59  case SqlWindowFunctionKind::MAX:
60  case SqlWindowFunctionKind::SUM:
61  case SqlWindowFunctionKind::COUNT: {
62  return codegenWindowFunctionAggregate(co);
63  }
64  default: {
65  LOG(FATAL) << "Invalid window function kind";
66  }
67  }
68  return nullptr;
69 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
#define LOG(tag)
Definition: Logger.h:203
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
static const WindowProjectNodeContext * get(Executor *executor)
const WindowFunctionContext * activateWindowFunctionContext(Executor *executor, const size_t target_index) const
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenWindowFunctionAggregate(const CompilationOptions &co)
#define CHECK(condition)
Definition: Logger.h:209
const Analyzer::WindowFunction * getWindowFunction() const

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunctionAggregate ( const CompilationOptions co)
private

Definition at line 140 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, CHECK, WindowProjectNodeContext::get(), get_int_type(), and WindowProjectNodeContext::getActiveWindowFunctionContext().

140  {
141  AUTOMATIC_IR_METADATA(cgen_state_.get());
142  const auto reset_state_false_bb = codegenWindowResetStateControlFlow();
143  auto aggregate_state = aggregateWindowStatePtr();
144  llvm::Value* aggregate_state_count = nullptr;
145  const auto window_func_context =
146  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
147  const auto window_func = window_func_context->getWindowFunction();
148  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
149  const auto aggregate_state_count_i64 = cgen_state_->llInt(
150  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
151  const auto pi64_type =
152  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
153  aggregate_state_count =
154  cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_count_i64, pi64_type);
155  }
156  codegenWindowFunctionStateInit(aggregate_state);
157  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
158  const auto count_zero = cgen_state_->llInt(int64_t(0));
159  cgen_state_->emitCall("agg_id", {aggregate_state_count, count_zero});
160  }
161  cgen_state_->ir_builder_.CreateBr(reset_state_false_bb);
162  cgen_state_->ir_builder_.SetInsertPoint(reset_state_false_bb);
164  return codegenWindowFunctionAggregateCalls(aggregate_state, co);
165 }
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
static const WindowProjectNodeContext * get(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
void codegenWindowFunctionStateInit(llvm::Value *aggregate_state)
#define CHECK(condition)
Definition: Logger.h:209
llvm::Value * codegenWindowFunctionAggregateCalls(llvm::Value *aggregate_state, const CompilationOptions &co)
llvm::BasicBlock * codegenWindowResetStateControlFlow()

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunctionAggregateCalls ( llvm::Value *  aggregate_state,
const CompilationOptions co 
)
private

Definition at line 246 of file WindowFunctionIR.cpp.

References args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, CodeGenerator::codegen(), CodeGenerator::codegenCastBetweenIntTypes(), COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), anonymous_namespace{WindowFunctionIR.cpp}::get_window_agg_name(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kFLOAT, and SUM.

247  {
248  AUTOMATIC_IR_METADATA(cgen_state_.get());
249  const auto window_func_context =
250  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
251  const auto window_func = window_func_context->getWindowFunction();
252  const auto window_func_ti = get_adjusted_window_type_info(window_func);
253  const auto window_func_null_val =
254  window_func_ti.is_fp()
255  ? cgen_state_->inlineFpNull(window_func_ti)
256  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
257  const auto& args = window_func->getArgs();
258  llvm::Value* crt_val;
259  if (args.empty()) {
260  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
261  crt_val = cgen_state_->llInt(int64_t(1));
262  } else {
263  CodeGenerator code_generator(this);
264  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
265  CHECK_EQ(arg_lvs.size(), size_t(1));
266  if (window_func->getKind() == SqlWindowFunctionKind::SUM && !window_func_ti.is_fp()) {
267  crt_val = code_generator.codegenCastBetweenIntTypes(
268  arg_lvs.front(), args.front()->get_type_info(), window_func_ti, false);
269  } else {
270  crt_val = window_func_ti.get_type() == kFLOAT
271  ? arg_lvs.front()
272  : cgen_state_->castToTypeIn(arg_lvs.front(), 64);
273  }
274  }
275  const auto agg_name = get_window_agg_name(window_func->getKind(), window_func_ti);
276  llvm::Value* multiplicity_lv = nullptr;
277  if (args.empty()) {
278  cgen_state_->emitCall(agg_name, {aggregate_state, crt_val});
279  } else {
280  cgen_state_->emitCall(agg_name + "_skip_val",
281  {aggregate_state, crt_val, window_func_null_val});
282  }
283  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
284  codegenWindowAvgEpilogue(crt_val, window_func_null_val, multiplicity_lv);
285  }
286  return codegenAggregateWindowState();
287 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
std::string get_window_agg_name(const SqlWindowFunctionKind kind, const SQLTypeInfo &window_func_ti)
void codegenWindowAvgEpilogue(llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenAggregateWindowState()
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:

void Executor::codegenWindowFunctionStateInit ( llvm::Value *  aggregate_state)
private

Definition at line 196 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

196  {
197  AUTOMATIC_IR_METADATA(cgen_state_.get());
198  const auto window_func_context =
199  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
200  const auto window_func = window_func_context->getWindowFunction();
201  const auto window_func_ti = get_adjusted_window_type_info(window_func);
202  const auto window_func_null_val =
203  window_func_ti.is_fp()
204  ? cgen_state_->inlineFpNull(window_func_ti)
205  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
206  llvm::Value* window_func_init_val;
207  if (window_func_context->getWindowFunction()->getKind() ==
208  SqlWindowFunctionKind::COUNT) {
209  switch (window_func_ti.get_type()) {
210  case kFLOAT: {
211  window_func_init_val = cgen_state_->llFp(float(0));
212  break;
213  }
214  case kDOUBLE: {
215  window_func_init_val = cgen_state_->llFp(double(0));
216  break;
217  }
218  default: {
219  window_func_init_val = cgen_state_->llInt(int64_t(0));
220  break;
221  }
222  }
223  } else {
224  window_func_init_val = window_func_null_val;
225  }
226  const auto pi32_type =
227  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
228  switch (window_func_ti.get_type()) {
229  case kDOUBLE: {
230  cgen_state_->emitCall("agg_id_double", {aggregate_state, window_func_init_val});
231  break;
232  }
233  case kFLOAT: {
234  aggregate_state =
235  cgen_state_->ir_builder_.CreateBitCast(aggregate_state, pi32_type);
236  cgen_state_->emitCall("agg_id_float", {aggregate_state, window_func_init_val});
237  break;
238  }
239  default: {
240  cgen_state_->emitCall("agg_id", {aggregate_state, window_func_init_val});
241  break;
242  }
243  }
244 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:

llvm::BasicBlock * Executor::codegenWindowResetStateControlFlow ( )
private

Definition at line 167 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, WindowProjectNodeContext::getActiveWindowFunctionContext(), CodeGenerator::posArg(), and CodeGenerator::toBool().

167  {
168  AUTOMATIC_IR_METADATA(cgen_state_.get());
169  const auto window_func_context =
170  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
171  const auto bitset = cgen_state_->llInt(
172  reinterpret_cast<const int64_t>(window_func_context->partitionStart()));
173  const auto min_val = cgen_state_->llInt(int64_t(0));
174  const auto max_val = cgen_state_->llInt(window_func_context->elementCount() - 1);
175  const auto null_val = cgen_state_->llInt(inline_int_null_value<int64_t>());
176  const auto null_bool_val = cgen_state_->llInt<int8_t>(inline_int_null_value<int8_t>());
177  CodeGenerator code_generator(this);
178  const auto reset_state =
179  code_generator.toBool(cgen_state_->emitCall("bit_is_set",
180  {bitset,
181  code_generator.posArg(nullptr),
182  min_val,
183  max_val,
184  null_val,
185  null_bool_val}));
186  const auto reset_state_true_bb = llvm::BasicBlock::Create(
187  cgen_state_->context_, "reset_state.true", cgen_state_->current_func_);
188  const auto reset_state_false_bb = llvm::BasicBlock::Create(
189  cgen_state_->context_, "reset_state.false", cgen_state_->current_func_);
190  cgen_state_->ir_builder_.CreateCondBr(
191  reset_state, reset_state_true_bb, reset_state_false_bb);
192  cgen_state_->ir_builder_.SetInsertPoint(reset_state_true_bb);
193  return reset_state_false_bb;
194 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)

+ Here is the call graph for this function:

ResultSetPtr Executor::collectAllDeviceResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
const QueryMemoryDescriptor query_mem_desc,
const ExecutorDeviceType  device_type,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner 
)
private

Definition at line 1920 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::build_row_for_empty_input(), catalog_, DEBUG_TIMER, SharedKernelContext::getFragmentResults(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, NonGroupedAggregate, GroupByAndAggregate::shard_count_for_top_groups(), RelAlgExecutionUnit::target_exprs, and use_speculative_top_n().

1925  {
1926  auto timer = DEBUG_TIMER(__func__);
1927  auto& result_per_device = shared_context.getFragmentResults();
1928  if (result_per_device.empty() && query_mem_desc.getQueryDescriptionType() ==
1929  QueryDescriptionType::NonGroupedAggregate) {
1930  return build_row_for_empty_input(
1931  ra_exe_unit.target_exprs, query_mem_desc, device_type);
1932  }
1933  if (use_speculative_top_n(ra_exe_unit, query_mem_desc)) {
1934  try {
1935  return reduceSpeculativeTopN(
1936  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1937  } catch (const std::bad_alloc&) {
1938  throw SpeculativeTopNFailed("Failed during multi-device reduction.");
1939  }
1940  }
1941  const auto shard_count =
1942  device_type == ExecutorDeviceType::GPU
1943  ? GroupByAndAggregate::shard_count_for_top_groups(ra_exe_unit, *catalog_)
1944  : 0;
1945 
1946  if (shard_count && !result_per_device.empty()) {
1947  return collectAllDeviceShardedTopResults(shared_context, ra_exe_unit);
1948  }
1949  return reduceMultiDeviceResults(
1950  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1951 }
std::vector< Analyzer::Expr * > target_exprs
bool use_speculative_top_n(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc)
ResultSetPtr reduceSpeculativeTopN(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:1033
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1102
ResultSetPtr reduceMultiDeviceResults(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:926
ResultSetPtr collectAllDeviceShardedTopResults(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
Definition: Execute.cpp:2035
QueryDescriptionType getQueryDescriptionType() const
ResultSetPtr build_row_for_empty_input(const std::vector< Analyzer::Expr * > &target_exprs_in, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: Execute.cpp:1878
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define DEBUG_TIMER(name)
Definition: Logger.h:352
static size_t shard_count_for_top_groups(const RelAlgExecutionUnit &ra_exe_unit, const Catalog_Namespace::Catalog &catalog)

+ Here is the call graph for this function:

ResultSetPtr Executor::collectAllDeviceShardedTopResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit 
) const
private

Definition at line 2035 of file Execute.cpp.

References catalog_, CHECK, CHECK_EQ, CHECK_LE, SharedKernelContext::getFragmentResults(), SortInfo::limit, SortInfo::offset, SortInfo::order_entries, anonymous_namespace{Execute.cpp}::permute_storage_columnar(), anonymous_namespace{Execute.cpp}::permute_storage_row_wise(), result, and RelAlgExecutionUnit::sort_info.

2037  {
2038  auto& result_per_device = shared_context.getFragmentResults();
2039  const auto first_result_set = result_per_device.front().first;
2040  CHECK(first_result_set);
2041  auto top_query_mem_desc = first_result_set->getQueryMemDesc();
2042  CHECK(!top_query_mem_desc.hasInterleavedBinsOnGpu());
2043  const auto top_n = ra_exe_unit.sort_info.limit + ra_exe_unit.sort_info.offset;
2044  top_query_mem_desc.setEntryCount(0);
2045  for (auto& result : result_per_device) {
2046  const auto result_set = result.first;
2047  CHECK(result_set);
2048  result_set->sort(ra_exe_unit.sort_info.order_entries, top_n, this);
2049  size_t new_entry_cnt = top_query_mem_desc.getEntryCount() + result_set->rowCount();
2050  top_query_mem_desc.setEntryCount(new_entry_cnt);
2051  }
2052  auto top_result_set = std::make_shared<ResultSet>(first_result_set->getTargetInfos(),
2053  first_result_set->getDeviceType(),
2054  top_query_mem_desc,
2055  first_result_set->getRowSetMemOwner(),
2056  catalog_,
2057  blockSize(),
2058  gridSize());
2059  auto top_storage = top_result_set->allocateStorage();
2060  size_t top_output_row_idx{0};
2061  for (auto& result : result_per_device) {
2062  const auto result_set = result.first;
2063  CHECK(result_set);
2064  const auto& top_permutation = result_set->getPermutationBuffer();
2065  CHECK_LE(top_permutation.size(), top_n);
2066  if (top_query_mem_desc.didOutputColumnar()) {
2067  top_output_row_idx = permute_storage_columnar(result_set->getStorage(),
2068  result_set->getQueryMemDesc(),
2069  top_storage,
2070  top_output_row_idx,
2071  top_query_mem_desc,
2072  top_permutation);
2073  } else {
2074  top_output_row_idx = permute_storage_row_wise(result_set->getStorage(),
2075  top_storage,
2076  top_output_row_idx,
2077  top_query_mem_desc,
2078  top_permutation);
2079  }
2080  }
2081  CHECK_EQ(top_output_row_idx, top_query_mem_desc.getEntryCount());
2082  return top_result_set;
2083 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
const std::list< Analyzer::OrderEntry > order_entries
size_t permute_storage_row_wise(const ResultSetStorage *input_storage, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:2014
const size_t limit
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1102
#define CHECK_LE(x, y)
Definition: Logger.h:220
unsigned gridSize() const
Definition: Execute.cpp:3440
size_t permute_storage_columnar(const ResultSetStorage *input_storage, const QueryMemoryDescriptor &input_query_mem_desc, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1964
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define CHECK(condition)
Definition: Logger.h:209
unsigned blockSize() const
Definition: Execute.cpp:3454
const size_t offset


bool Executor::compileBody ( const RelAlgExecutionUnit &  ra_exe_unit,
GroupByAndAggregate &  group_by_and_aggregate,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
const GpuSharedMemoryContext &  gpu_smem_context = {} 
)
private

Definition at line 3076 of file NativeCodegen.cpp.

3076 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
3077  GroupByAndAggregate& group_by_and_aggregate,
3078  const QueryMemoryDescriptor& query_mem_desc,
3079  const CompilationOptions& co,
3080  const GpuSharedMemoryContext& gpu_smem_context) {
3081  AUTOMATIC_IR_METADATA(cgen_state_.get());
3082 
3083  // Switch the code generation into a separate filter function if enabled.
3084  // Note that accesses to function arguments are still codegenned from the
3085  // row function's arguments, then later automatically forwarded and
3086  // remapped into filter function arguments by redeclareFilterFunction().
3087  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
3088  llvm::Value* loop_done{nullptr};
3089  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
3090  if (cgen_state_->filter_func_) {
3091  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3092  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
3093  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
3094  row_func_entry_bb->begin());
3095  loop_done = cgen_state_->ir_builder_.CreateAlloca(
3096  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
3097  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3098  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
3099  }
3100  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
3101  cgen_state_->current_func_ = cgen_state_->filter_func_;
3102  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
3103  }
3104 
3105  // generate the code for the filter
3106  std::vector<Analyzer::Expr*> primary_quals;
3107  std::vector<Analyzer::Expr*> deferred_quals;
3108  bool short_circuited = CodeGenerator::prioritizeQuals(
3109  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
3110  if (short_circuited) {
3111  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
3112  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
3113  << " quals";
3114  }
3115  llvm::Value* filter_lv = cgen_state_->llBool(true);
3116  CodeGenerator code_generator(this);
3117  for (auto expr : primary_quals) {
3118  // Generate the filter for primary quals
3119  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
3120  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
3121  }
3122  CHECK(filter_lv->getType()->isIntegerTy(1));
3123  llvm::BasicBlock* sc_false{nullptr};
3124  if (!deferred_quals.empty()) {
3125  auto sc_true = llvm::BasicBlock::Create(
3126  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
3127  sc_false = llvm::BasicBlock::Create(
3128  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
3129  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3130  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3131  if (ra_exe_unit.join_quals.empty()) {
3132  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3133  }
3134  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3135  filter_lv = cgen_state_->llBool(true);
3136  }
3137  for (auto expr : deferred_quals) {
3138  filter_lv = cgen_state_->ir_builder_.CreateAnd(
3139  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
3140  }
3141 
3142  CHECK(filter_lv->getType()->isIntegerTy(1));
3143  auto ret = group_by_and_aggregate.codegen(
3144  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3145 
3146  // Switch the code generation back to the row function if a filter
3147  // function was enabled.
3148  if (cgen_state_->filter_func_) {
3149  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3150  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
3151  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3152  }
3153 
3154  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3155  cgen_state_->current_func_ = cgen_state_->row_func_;
3156  cgen_state_->filter_func_call_ =
3157  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3158 
3159  // Create real filter function declaration after placeholder call
3160  // is emitted.
3161  redeclareFilterFunction();
3162 
3163  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3164  auto loop_done_true = llvm::BasicBlock::Create(
3165  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
3166  auto loop_done_false = llvm::BasicBlock::Create(
3167  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
3168  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
3169  cgen_state_->ir_builder_.CreateCondBr(
3170  loop_done_flag, loop_done_true, loop_done_false);
3171  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3172  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3173  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3174  } else {
3175  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3176  }
3177  }
3178  return ret;
3179 }
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr * > &primary_quals, std::vector< Analyzer::Expr * > &deferred_quals, const PlanState::HoistedFiltersSet &hoisted_quals)
Definition: LogicalIR.cpp:157
#define CHECK(condition)
Definition: Logger.h:209
void redeclareFilterFunction()
Definition: IRCodegen.cpp:1009
#define VLOG(n)
Definition: Logger.h:303
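An interpreter-style sketch of the short-circuit scheme compileBody() emits as IR: prioritized (cheap) quals are evaluated first, and the deferred (expensive) quals only run for rows that already passed. The Qual alias and row representation are illustrative assumptions, not the codegen API.

#include <cstdint>
#include <functional>
#include <vector>

using Qual = std::function<bool(const int64_t* row)>;

// Rows failing a primary qual take the equivalent of the sc_false branch
// and never pay for the deferred quals.
bool passesFilter(const int64_t* row,
                  const std::vector<Qual>& primary_quals,
                  const std::vector<Qual>& deferred_quals) {
  for (const auto& qual : primary_quals) {
    if (!qual(row)) {
      return false;
    }
  }
  for (const auto& qual : deferred_quals) {
    if (!qual(row)) {
      return false;
    }
  }
  return true;
}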
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > Executor::compileWorkUnit ( const std::vector< InputTableInfo > &  query_infos,
const PlanState::DeletedColumnsMap &  deleted_cols_map,
const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const CudaMgr_Namespace::CudaMgr *  cuda_mgr,
const bool  allow_lazy_fetch,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
const bool  has_cardinality_estimation,
ColumnCacheMap &  column_cache,
RenderInfo *  render_info = nullptr 
)
private

Definition at line 2562 of file NativeCodegen.cpp.

2574  {
2575  auto timer = DEBUG_TIMER(__func__);
2576 
2577  if (co.device_type == ExecutorDeviceType::GPU) {
2578  const auto cuda_mgr = data_mgr_->getCudaMgr();
2579  if (!cuda_mgr) {
2580  throw QueryMustRunOnCpu();
2581  }
2582  }
2583 
2584 #ifndef NDEBUG
2585  static std::uint64_t counter = 0;
2586  ++counter;
2587  VLOG(1) << "CODEGEN #" << counter << ":";
2588  LOG(IR) << "CODEGEN #" << counter << ":";
2589  LOG(PTX) << "CODEGEN #" << counter << ":";
2590  LOG(ASM) << "CODEGEN #" << counter << ":";
2591 #endif
2592 
2593  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2594 
2595  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2596 
2597  GroupByAndAggregate group_by_and_aggregate(
2598  this,
2599  co.device_type,
2600  ra_exe_unit,
2601  query_infos,
2602  row_set_mem_owner,
2603  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2604  : std::nullopt);
2605  auto query_mem_desc =
2606  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2607  max_groups_buffer_entry_guess,
2608  crt_min_byte_width,
2609  render_info,
2610  eo.output_columnar_hint);
2611 
2612  if (query_mem_desc->getQueryDescriptionType() ==
2613  QueryDescriptionType::GroupByBaselineHash &&
2614  !has_cardinality_estimation &&
2615  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2616  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2617  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2618  }
2619 
2620  const bool output_columnar = query_mem_desc->didOutputColumnar();
2621  const bool gpu_shared_mem_optimization =
2622  is_gpu_shared_mem_supported(query_mem_desc.get(),
2623  ra_exe_unit,
2624  cuda_mgr,
2625  co.device_type,
2626  cuda_mgr ? this->blockSize() : 1,
2627  cuda_mgr ? this->numBlocksPerMP() : 1);
2628  if (gpu_shared_mem_optimization) {
2629  // disable interleaved bins optimization on the GPU
2630  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2631  LOG(DEBUG1) << "GPU shared memory is used for the " +
2632  query_mem_desc->queryDescTypeToString() + " query(" +
2633  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2634  query_mem_desc.get())) +
2635  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2636  }
2637 
2638  const GpuSharedMemoryContext gpu_smem_context(
2639  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2640 
2641  if (co.device_type == ExecutorDeviceType::GPU) {
2642  const size_t num_count_distinct_descs =
2643  query_mem_desc->getCountDistinctDescriptorsSize();
2644  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2645  const auto& count_distinct_descriptor =
2646  query_mem_desc->getCountDistinctDescriptor(i);
2647  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2648  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2649  !co.hoist_literals)) {
2650  throw QueryMustRunOnCpu();
2651  }
2652  }
2653  }
2654 
2655  // Read the module template and target either CPU or GPU
2656  // by binding the stream position functions to the right implementation:
2657  // stride access for GPU, contiguous for CPU
2658  auto rt_module_copy = llvm::CloneModule(
2659  *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
2660  auto func = llvm::dyn_cast<llvm::Function>(gv);
2661  if (!func) {
2662  return true;
2663  }
2664  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2665  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2666  CodeGenerator::alwaysCloneRuntimeFunction(func));
2667  });
2668  if (co.device_type == ExecutorDeviceType::CPU) {
2669  if (is_udf_module_present(true)) {
2670  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
2671  }
2672  if (is_rt_udf_module_present(true)) {
2673  CodeGenerator::link_udf_module(
2674  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2675  }
2676  } else {
2677  rt_module_copy->setDataLayout(get_gpu_data_layout());
2678  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2679  if (is_udf_module_present()) {
2680  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
2681  }
2682  if (is_rt_udf_module_present()) {
2683  CodeGenerator::link_udf_module(
2684  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2685  }
2686  }
2687 
2688  cgen_state_->module_ = rt_module_copy.release();
2689  AUTOMATIC_IR_METADATA(cgen_state_.get());
2690 
2691  auto agg_fnames =
2692  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2693 
2694  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2695 
2696  const bool is_group_by{query_mem_desc->isGroupBy()};
2697  auto [query_func, row_func_call] = is_group_by
2698  ? query_group_by_template(cgen_state_->module_,
2699  co.hoist_literals,
2700  *query_mem_desc,
2701  co.device_type,
2702  ra_exe_unit.scan_limit,
2703  gpu_smem_context)
2704  : query_template(cgen_state_->module_,
2705  agg_slot_count,
2706  co.hoist_literals,
2707  !!ra_exe_unit.estimator,
2708  gpu_smem_context);
2709  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2710  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2711  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2712 
2713  cgen_state_->query_func_ = query_func;
2714  cgen_state_->row_func_call_ = row_func_call;
2715  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2716  &query_func->getEntryBlock().front());
2717 
2718  // Generate the function signature and column head fetches s.t.
2719  // double indirection isn't needed in the inner loop
2720  auto& fetch_bb = query_func->front();
2721  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2722  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2723  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2724  query_func->args().begin(),
2725  fetch_ir_builder,
2726  cgen_state_->context_);
2727  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2728 
2729  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2730  is_group_by ? 0 : agg_slot_count,
2731  co.hoist_literals,
2732  cgen_state_->module_,
2733  cgen_state_->context_);
2734  CHECK(cgen_state_->row_func_);
2735  cgen_state_->row_func_bb_ =
2736  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2737 
2738  if (g_enable_filter_function) {
2739  auto filter_func_ft =
2740  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2741  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2742  llvm::Function::ExternalLinkage,
2743  "filter_func",
2744  cgen_state_->module_);
2745  CHECK(cgen_state_->filter_func_);
2746  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2747  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2748  }
2749 
2750  cgen_state_->current_func_ = cgen_state_->row_func_;
2751  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2752 
2753  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2754  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2755  const auto join_loops =
2756  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2757 
2758  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2759  for (auto& simple_qual : ra_exe_unit.simple_quals) {
2760  plan_state_->addSimpleQual(simple_qual);
2761  }
2762  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2763  if (is_not_deleted_bb) {
2764  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2765  }
2766  if (!join_loops.empty()) {
2767  codegenJoinLoops(join_loops,
2768  body_execution_unit,
2769  group_by_and_aggregate,
2770  query_func,
2771  cgen_state_->row_func_bb_,
2772  *(query_mem_desc.get()),
2773  co,
2774  eo);
2775  } else {
2776  const bool can_return_error = compileBody(
2777  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2778  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2779  eo.allow_runtime_query_interrupt) {
2780  createErrorCheckControlFlow(query_func,
2781  eo.with_dynamic_watchdog,
2782  eo.allow_runtime_query_interrupt,
2783  co.device_type,
2784  group_by_and_aggregate.query_infos_);
2785  }
2786  }
2787  std::vector<llvm::Value*> hoisted_literals;
2788 
2789  if (co.hoist_literals) {
2790  VLOG(1) << "number of hoisted literals: "
2791  << cgen_state_->query_func_literal_loads_.size()
2792  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2793  << " bytes";
2794  }
2795 
2796  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2797  // we have some hoisted literals...
2798  hoisted_literals = inlineHoistedLiterals();
2799  }
2800 
2801  // replace the row func placeholder call with the call to the actual row func
2802  std::vector<llvm::Value*> row_func_args;
2803  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2804  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2805  }
2806  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2807  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2808  // push hoisted literals arguments, if any
2809  row_func_args.insert(
2810  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2811  llvm::ReplaceInstWithInst(
2812  cgen_state_->row_func_call_,
2813  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2814 
2815  // replace the filter func placeholder call with the call to the actual filter func
2816  if (cgen_state_->filter_func_) {
2817  std::vector<llvm::Value*> filter_func_args;
2818  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2819  arg_it != cgen_state_->filter_func_args_.end();
2820  ++arg_it) {
2821  filter_func_args.push_back(arg_it->first);
2822  }
2823  llvm::ReplaceInstWithInst(
2824  cgen_state_->filter_func_call_,
2825  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2826  }
2827 
2828  // Aggregate
2829  plan_state_->init_agg_vals_ =
2830  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2831 
2832  /*
2833  * If we have decided to use GPU shared memory (decision is not made here), then
2834  * we generate proper code for extra components that it needs (buffer initialization and
2835  * gpu reduction from shared memory to global memory). We then replace these functions
2836  * into the already compiled query_func (replacing two placeholders, write_back_nop and
2837  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
2838  */
2839  if (gpu_smem_context.isSharedMemoryUsed()) {
2840  if (query_mem_desc->getQueryDescriptionType() ==
2841  QueryDescriptionType::NonGroupedAggregate) {
2842  GpuSharedMemCodeBuilder gpu_smem_code(
2843  cgen_state_->module_,
2844  cgen_state_->context_,
2845  *query_mem_desc,
2846  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
2847  plan_state_->init_agg_vals_);
2848  gpu_smem_code.codegen();
2849  gpu_smem_code.injectFunctionsInto(query_func);
2850 
2851  // helper functions are used for caching purposes later
2852  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2853  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2854  LOG(IR) << gpu_smem_code.toString();
2855  }
2856  }
2857 
2858  auto multifrag_query_func = cgen_state_->module_->getFunction(
2859  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2860  CHECK(multifrag_query_func);
2861 
2862  if (co.device_type == ExecutorDeviceType::GPU && eo.allow_runtime_query_interrupt) {
2863  insertErrorCodeChecker(
2864  multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
2865  }
2866 
2867  bind_query(query_func,
2868  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2869  multifrag_query_func,
2870  cgen_state_->module_);
2871 
2872  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2873  if (cgen_state_->filter_func_) {
2874  root_funcs.push_back(cgen_state_->filter_func_);
2875  }
2876  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2877  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2878 
2879  // Always inline the row function and the filter function.
2880  // We don't want register spills in the inner loops.
2881  // LLVM seems to correctly free up alloca instructions
2882  // in these functions even when they are inlined.
2883  mark_function_always_inline(cgen_state_->row_func_);
2884  if (cgen_state_->filter_func_) {
2885  mark_function_always_inline(cgen_state_->filter_func_);
2886  }
2887 
2888 #ifndef NDEBUG
2889  // Add helpful metadata to the LLVM IR for debugging.
2890  AUTOMATIC_IR_METADATA_DONE();
2891 #endif
2892 
2893  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2894  std::string llvm_ir;
2895  if (eo.just_explain) {
2896  if (co.explain_type == ExecutorExplainType::Optimized) {
2897 #ifdef WITH_JIT_DEBUG
2898  throw std::runtime_error(
2899  "Explain optimized not available when JIT runtime debug symbols are enabled");
2900 #else
2901  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2902  // optimized IR after NVVM reflect
2903  llvm::legacy::PassManager pass_manager;
2904  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2905 #endif // WITH_JIT_DEBUG
2906  }
2907  llvm_ir =
2908  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2909  serialize_llvm_object(cgen_state_->row_func_) +
2910  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2911  : "");
2912 
2913 #ifndef NDEBUG
2914  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2915 #endif
2916  }
2917 
2918  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2919  LOG(IR) << "IR for the "
2920  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2921 #ifdef NDEBUG
2922  LOG(IR) << serialize_llvm_object(query_func)
2923  << serialize_llvm_object(cgen_state_->row_func_)
2924  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2925  : "")
2926  << "\nEnd of IR";
2927 #else
2928  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2929 #endif
2930 
2931  // Run some basic validation checks on the LLVM IR before code is generated below.
2932  verify_function_ir(cgen_state_->row_func_);
2933  if (cgen_state_->filter_func_) {
2934  verify_function_ir(cgen_state_->filter_func_);
2935  }
2936 
2937  // Generate final native code from the LLVM IR.
2938  return std::make_tuple(
2939  CompilationResult{
2940  co.device_type == ExecutorDeviceType::CPU
2941  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2942  : optimizeAndCodegenGPU(query_func,
2943  multifrag_query_func,
2944  live_funcs,
2945  is_group_by || ra_exe_unit.estimator,
2946  cuda_mgr,
2947  co),
2948  cgen_state_->getLiterals(),
2949  output_columnar,
2950  llvm_ir,
2951  std::move(gpu_smem_context)},
2952  std::move(query_mem_desc));
2953 }
std::tuple< llvm::Function *, llvm::CallInst * > query_group_by_template(llvm::Module *module, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:218
std::vector< Analyzer::Expr * > target_exprs
bool is_udf_module_present(bool cpu_only=false)
#define CHECK_EQ(x, y)
Definition: Logger.h:217
std::unique_ptr< llvm::Module > rt_udf_cpu_module
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1103
void codegenJoinLoops(const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
Definition: IRCodegen.cpp:1126
std::unique_ptr< llvm::Module > udf_gpu_module
#define LOG(tag)
Definition: Logger.h:203
std::unique_ptr< llvm::Module > rt_udf_gpu_module
void mark_function_always_inline(llvm::Function *func)
llvm::StringRef get_gpu_data_layout()
std::vector< InputDescriptor > input_descs
std::string serialize_llvm_metadata_footnotes(llvm::Function *query_func, CgenState *cgen_state)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
unsigned numBlocksPerMP() const
Definition: Execute.cpp:3449
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
void optimize_ir(llvm::Function *query_func, llvm::Module *module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co)
void addTransientStringLiterals(const RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< RowSetMemoryOwner > &row_set_mem_owner)
Definition: Execute.cpp:1739
std::vector< std::string > get_agg_fnames(const std::vector< Analyzer::Expr * > &target_exprs, const bool is_group_by)
std::string to_string(char const *&&v)
void preloadFragOffsets(const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
Definition: Execute.cpp:3377
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned gpu_blocksize, const unsigned num_blocks_per_mp)
llvm::StringRef get_gpu_target_triple_string()
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
void verify_function_ir(const llvm::Function *func)
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:164
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU(llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
static std::unordered_set< llvm::Function * > markDeadRuntimeFuncs(llvm::Module &module, const std::vector< llvm::Function * > &roots, const std::vector< llvm::Function * > &leaves)
std::unique_ptr< llvm::Module > g_rt_module
ExecutorExplainType explain_type
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU(llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
void insertErrorCodeChecker(llvm::Function *query_func, bool hoist_literals, bool allow_runtime_query_interrupt)
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
const std::shared_ptr< Analyzer::Estimator > estimator
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define AUTOMATIC_IR_METADATA_DONE()
ExecutorDeviceType device_type
llvm::BasicBlock * codegenSkipDeletedOuterTableRow(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
std::string serialize_llvm_object(const T *llvm_obj)
static bool alwaysCloneRuntimeFunction(const llvm::Function *func)
std::vector< llvm::Value * > generate_column_heads_load(const int num_columns, llvm::Value *byte_stream_arg, llvm::IRBuilder<> &ir_builder, llvm::LLVMContext &ctx)
std::unique_ptr< llvm::Module > udf_cpu_module
bool g_enable_filter_function
Definition: Execute.cpp:80
void bind_pos_placeholders(const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *module)
void nukeOldState(const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
Definition: Execute.cpp:3358
std::tuple< llvm::Function *, llvm::CallInst * > query_template(llvm::Module *module, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
std::list< std::shared_ptr< Analyzer::Expr > > quals
bool isPotentialInSituRender() const
Definition: RenderInfo.cpp:63
#define CHECK(condition)
Definition: Logger.h:209
#define DEBUG_TIMER(name)
Definition: Logger.h:352
std::vector< llvm::Value * > inlineHoistedLiterals()
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
llvm::Function * create_row_function(const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *module, llvm::LLVMContext &context)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:545
unsigned blockSize() const
Definition: Execute.cpp:3454
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
bool is_rt_udf_module_present(bool cpu_only=false)
#define VLOG(n)
Definition: Logger.h:303
size_t get_shared_memory_size(const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals
void bind_query(llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *module)
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
size_t g_gpu_smem_threshold
Definition: Execute.cpp:123
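The module-cloning step near the top of the listing can be isolated as follows. This uses the stock llvm::CloneModule overload with a ShouldCloneDefinition callback, as the listing does; the predicate shown keeps only the private/internal-linkage case and omits the alwaysCloneRuntimeFunction() special case, so treat it as a simplified sketch.

#include <memory>

#include <llvm/IR/Module.h>
#include <llvm/Transforms/Utils/Cloning.h>

// Sketch: copy the runtime-module template so per-query codegen can mutate
// it freely. External functions keep only declarations; private/internal
// ones are cloned with their bodies.
std::unique_ptr<llvm::Module> cloneRuntimeTemplate(const llvm::Module& rt_module,
                                                   llvm::ValueToValueMapTy& vmap) {
  return llvm::CloneModule(rt_module, vmap, [](const llvm::GlobalValue* gv) {
    auto func = llvm::dyn_cast<llvm::Function>(gv);
    if (!func) {
      return true;  // clone non-function globals wholesale
    }
    return func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
           func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage;
  });
}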
AggregatedColRange Executor::computeColRangesCache ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3857 of file Execute.cpp.

References catalog_(), CHECK, getLeafColumnRange(), AggregatedColRange::setColRange(), and ExpressionRange::typeSupportsRange().

3858  {
3859  AggregatedColRange agg_col_range_cache;
3860  CHECK(catalog_);
3861  std::unordered_set<int> phys_table_ids;
3862  for (const auto& phys_input : phys_inputs) {
3863  phys_table_ids.insert(phys_input.table_id);
3864  }
3865  std::vector<InputTableInfo> query_infos;
3866  for (const int table_id : phys_table_ids) {
3867  query_infos.emplace_back(InputTableInfo{table_id, getTableInfo(table_id)});
3868  }
3869  for (const auto& phys_input : phys_inputs) {
3870  const auto cd =
3871  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3872  CHECK(cd);
3873  if (ExpressionRange::typeSupportsRange(cd->columnType)) {
3874  const auto col_var = boost::make_unique<Analyzer::ColumnVar>(
3875  cd->columnType, phys_input.table_id, phys_input.col_id, 0);
3876  const auto col_range = getLeafColumnRange(col_var.get(), query_infos, this, false);
3877  agg_col_range_cache.setColRange(phys_input, col_range);
3878  }
3879  }
3880  return agg_col_range_cache;
3881 }
Fragmenter_Namespace::TableInfo getTableInfo(const int table_id) const
Definition: Execute.cpp:311
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1102
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
ExpressionRange getLeafColumnRange(const Analyzer::ColumnVar *col_expr, const std::vector< InputTableInfo > &query_infos, const Executor *executor, const bool is_outer_join_proj)
#define CHECK(condition)
Definition: Logger.h:209
void setColRange(const PhysicalInput &, const ExpressionRange &)
static bool typeSupportsRange(const SQLTypeInfo &ti)

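A reduced sketch of the aggregation being cached here: fold fragment-level min/max metadata into one [min, max] per physical column, which is what getLeafColumnRange() does over the table's fragments. The types below are simplified stand-ins for PhysicalInput and ExpressionRange.

#include <algorithm>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

struct Range {
  int64_t min;
  int64_t max;
};

using ColKey = std::pair<int, int>;  // (table_id, col_id)

// Fold per-fragment stats into a per-column range; queries can then pick
// narrow integer types and dense hash layouts without rescanning data.
std::map<ColKey, Range> aggregateColRanges(
    const std::map<ColKey, std::vector<Range>>& per_fragment_stats) {
  std::map<ColKey, Range> cache;
  for (const auto& [col, frags] : per_fragment_stats) {
    Range agg{INT64_MAX, INT64_MIN};
    for (const auto& f : frags) {
      agg.min = std::min(agg.min, f.min);
      agg.max = std::max(agg.max, f.max);
    }
    cache.emplace(col, agg);
  }
  return cache;
}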

StringDictionaryGenerations Executor::computeStringDictionaryGenerations ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3883 of file Execute.cpp.

References catalog_(), CHECK, kENCODING_DICT, and StringDictionaryGenerations::setGeneration().

3884  {
3885  StringDictionaryGenerations string_dictionary_generations;
3886  CHECK(catalog_);
3887  for (const auto& phys_input : phys_inputs) {
3888  const auto cd =
3889  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3890  CHECK(cd);
3891  const auto& col_ti =
3892  cd->columnType.is_array() ? cd->columnType.get_elem_type() : cd->columnType;
3893  if (col_ti.is_string() && col_ti.get_compression() == kENCODING_DICT) {
3894  const int dict_id = col_ti.get_comp_param();
3895  const auto dd = catalog_->getMetadataForDict(dict_id);
3896  CHECK(dd && dd->stringDict);
3897  string_dictionary_generations.setGeneration(dict_id,
3898  dd->stringDict->storageEntryCount());
3899  }
3900  }
3901  return string_dictionary_generations;
3902 }
void setGeneration(const uint32_t id, const uint64_t generation)
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1102
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
const DictDescriptor * getMetadataForDict(int dict_ref, bool loadDict=true) const
Definition: Catalog.cpp:1537
#define CHECK(condition)
Definition: Logger.h:209

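The "generation" stored here is simply the dictionary's entry count at snapshot time: string ids appended after the snapshot are at or beyond the generation and therefore invisible to the running query. A minimal sketch of that bookkeeping (the visibility helper is an illustrative gloss, not the StringDictionary API):

#include <cstdint>
#include <unordered_map>

class DictGenerations {
 public:
  // Record the dictionary's current size as its generation.
  void setGeneration(const int dict_id, const uint64_t entry_count) {
    generations_[dict_id] = entry_count;
  }

  // Ids at or past the generation were added after the snapshot.
  bool isVisible(const int dict_id, const uint64_t string_id) const {
    const auto it = generations_.find(dict_id);
    return it != generations_.end() && string_id < it->second;
  }

 private:
  std::unordered_map<int, uint64_t> generations_;
};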

TableGenerations Executor::computeTableGenerations ( std::unordered_set< int >  phys_table_ids)
private

Definition at line 3904 of file Execute.cpp.

References TableGenerations::setGeneration().

3905  {
3906  TableGenerations table_generations;
3907  for (const int table_id : phys_table_ids) {
3908  const auto table_info = getTableInfo(table_id);
3909  table_generations.setGeneration(
3910  table_id,
3911  TableGeneration{static_cast<int64_t>(table_info.getPhysicalNumTuples()), 0});
3912  }
3913  return table_generations;
3914 }
Fragmenter_Namespace::TableInfo getTableInfo(const int table_id) const
Definition: Execute.cpp:311
void setGeneration(const uint32_t id, const TableGeneration &generation)


bool Executor::containsLeftDeepOuterJoin ( ) const
inline

Definition at line 436 of file Execute.h.

References cgen_state_.

436  {
437  return cgen_state_->contains_left_deep_outer_join_;
438  }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
void Executor::createErrorCheckControlFlow ( llvm::Function *  query_func,
bool  run_with_dynamic_watchdog,
bool  run_with_allowing_runtime_interrupt,
ExecutorDeviceType  device_type,
const std::vector< InputTableInfo > &  input_table_infos 
)
private

Definition at line 1916 of file NativeCodegen.cpp.

1916 void Executor::createErrorCheckControlFlow(
1917  llvm::Function* query_func,
1918  bool run_with_dynamic_watchdog,
1919  bool run_with_allowing_runtime_interrupt,
1920  ExecutorDeviceType device_type,
1921  const std::vector<InputTableInfo>& input_table_infos) {
1922  AUTOMATIC_IR_METADATA(cgen_state_.get());
1923 
1924  // check whether the row processing was successful; currently, it can
1925  // fail by running out of group by buffer slots
1926 
1927  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1928  // when both dynamic watchdog and runtime interrupt are turned on,
1929  // we use the dynamic watchdog
1930  run_with_allowing_runtime_interrupt = false;
1931  }
1932 
1933  {
1934  // disable injecting query interrupt checker if the session info is invalid
1935  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1936  if (current_query_session_.empty()) {
1937  run_with_allowing_runtime_interrupt = false;
1938  }
1939  }
1940 
1941  llvm::Value* row_count = nullptr;
1942  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1943  device_type == ExecutorDeviceType::GPU) {
1944  row_count =
1945  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1946  }
1947 
1948  bool done_splitting = false;
1949  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1950  ++bb_it) {
1951  llvm::Value* pos = nullptr;
1952  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1953  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1954  llvm::isa<llvm::PHINode>(*inst_it)) {
1955  if (inst_it->getName() == "pos") {
1956  pos = &*inst_it;
1957  }
1958  continue;
1959  }
1960  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1961  continue;
1962  }
1963  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1964  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1965  auto next_inst_it = inst_it;
1966  ++next_inst_it;
1967  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1968  auto& br_instr = bb_it->back();
1969  llvm::IRBuilder<> ir_builder(&br_instr);
1970  llvm::Value* err_lv = &*inst_it;
1971  llvm::Value* err_lv_returned_from_row_func = nullptr;
1972  if (run_with_dynamic_watchdog) {
1973  CHECK(pos);
1974  llvm::Value* call_watchdog_lv = nullptr;
1975  if (device_type == ExecutorDeviceType::GPU) {
1976  // In order to make sure all threads within a block see the same barrier,
1977  // only those blocks in which no thread has experienced the critical
1978  // edge will go through the dynamic watchdog computation
1979  CHECK(row_count);
1980  auto crit_edge_rem =
1981  (blockSize() & (blockSize() - 1))
1982  ? ir_builder.CreateSRem(
1983  row_count,
1984  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1985  : ir_builder.CreateAnd(
1986  row_count,
1987  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1988  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1989  crit_edge_threshold->setName("crit_edge_threshold");
1990 
1991  // only those threads where pos < crit_edge_threshold go through dynamic
1992  // watchdog call
1993  call_watchdog_lv =
1994  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1995  } else {
1996  // CPU path: run watchdog for every 64th row
1997  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1998  call_watchdog_lv = ir_builder.CreateICmp(
1999  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
2000  }
2001  CHECK(call_watchdog_lv);
2002  auto error_check_bb = bb_it->splitBasicBlock(
2003  llvm::BasicBlock::iterator(br_instr), ".error_check");
2004  auto& watchdog_br_instr = bb_it->back();
2005 
2006  auto watchdog_check_bb = llvm::BasicBlock::Create(
2007  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
2008  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
2009  auto detected_timeout = watchdog_ir_builder.CreateCall(
2010  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
2011  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
2012  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
2013  watchdog_ir_builder.CreateBr(error_check_bb);
2014 
2015  llvm::ReplaceInstWithInst(
2016  &watchdog_br_instr,
2017  llvm::BranchInst::Create(
2018  watchdog_check_bb, error_check_bb, call_watchdog_lv));
2019  ir_builder.SetInsertPoint(&br_instr);
2020  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2021 
2022  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
2023  unified_err_lv->addIncoming(err_lv, &*bb_it);
2024  err_lv = unified_err_lv;
2025  } else if (run_with_allowing_runtime_interrupt) {
2026  CHECK(pos);
2027  llvm::Value* call_check_interrupt_lv = nullptr;
2028  if (device_type == ExecutorDeviceType::GPU) {
2029  // approximate how many times the %pos variable
2030  // is increased --> the number of iteration
2031  // here we calculate the # bit shift by considering grid/block/fragment sizes
2032  // since if we use the fixed one (i.e., per 64-th increment)
2033  // some CUDA threads cannot enter the interrupt checking block depending on
2034  // the fragment size --> a thread may not take care of 64 threads if an outer
2035  // table is not sufficiently large, and so cannot be interrupted
2036  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
2037  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
2038  int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
2039  uint64_t interrupt_checking_freq = 32;
2040  auto freq_control_knob = g_running_query_interrupt_freq;
2041  CHECK_GT(freq_control_knob, 0);
2042  CHECK_LE(freq_control_knob, 1.0);
2043  if (!input_table_infos.empty()) {
2044  const auto& outer_table_info = *input_table_infos.begin();
2045  auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
2046  if (outer_table_info.table_id < 0) {
2047  auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
2048  CHECK(rs);
2049  num_outer_table_tuples = rs->entryCount();
2050  } else {
2051  auto num_frags = outer_table_info.info.fragments.size();
2052  if (num_frags > 0) {
2053  num_outer_table_tuples =
2054  outer_table_info.info.fragments.begin()->getNumTuples();
2055  }
2056  }
2057  if (num_outer_table_tuples > 0) {
2058  // gridSize * blockSize --> pos_step (idx of the next row per thread)
2059  // we additionally multiply two to pos_step since the number of
2060  // dispatched blocks are double of the gridSize
2061  // # tuples (of fragment) / pos_step --> maximum # increment (K)
2062  // also we multiply 1 / freq_control_knob to K to control the frequency
2063  // So, needs to check the interrupt status more frequently? make K smaller
2064  auto max_inc = uint64_t(
2065  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
2066  if (max_inc < 2) {
2067  // too small `max_inc`, so this correction is necessary to make
2068  // `interrupt_checking_freq` be valid (i.e., larger than zero)
2069  max_inc = 2;
2070  }
2071  auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
2072  interrupt_checking_freq =
2073  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
2074  // add the coverage when interrupt_checking_freq > K
2075  // if so, some threads still cannot be branched to the interrupt checker
2076  // so we manually use smaller but close to the max_inc as freq
2077  if (interrupt_checking_freq > max_inc) {
2078  interrupt_checking_freq = max_inc / 2;
2079  }
2080  if (interrupt_checking_freq < 8) {
2081  // such small freq incurs too frequent interrupt status checking,
2082  // so we fixup to the minimum freq value at some reasonable degree
2083  interrupt_checking_freq = 8;
2084  }
2085  }
2086  }
2087  VLOG(1) << "Set the running query interrupt checking frequency: "
2088  << interrupt_checking_freq;
2089  // check the interrupt flag for every interrupt_checking_freq-th iteration
2090  llvm::Value* pos_shifted_per_iteration =
2091  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
2092  auto interrupt_predicate =
2093  ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
2094  call_check_interrupt_lv =
2095  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2096  interrupt_predicate,
2097  cgen_state_->llInt(int64_t(0LL)));
2098  } else {
2099  // CPU path: run interrupt checker for every 64th row
2100  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
2101  call_check_interrupt_lv =
2102  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2103  interrupt_predicate,
2104  cgen_state_->llInt(int64_t(0LL)));
2105  }
2106  CHECK(call_check_interrupt_lv);
2107  auto error_check_bb = bb_it->splitBasicBlock(
2108  llvm::BasicBlock::iterator(br_instr), ".error_check");
2109  auto& check_interrupt_br_instr = bb_it->back();
2110 
2111  auto interrupt_check_bb = llvm::BasicBlock::Create(
2112  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2113  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2114  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2115  cgen_state_->module_->getFunction("check_interrupt"), {});
2116  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
2117  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
2118  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2119 
2120  llvm::ReplaceInstWithInst(
2121  &check_interrupt_br_instr,
2122  llvm::BranchInst::Create(
2123  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
2124  ir_builder.SetInsertPoint(&br_instr);
2125  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2126 
2127  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
2128  unified_err_lv->addIncoming(err_lv, &*bb_it);
2129  err_lv = unified_err_lv;
2130  }
2131  if (!err_lv_returned_from_row_func) {
2132  err_lv_returned_from_row_func = err_lv;
2133  }
2134  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
2135  // let kernel execution finish as expected, regardless of the observed error,
2136  // unless it is from the dynamic watchdog where all threads within that block
2137  // return together.
2138  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2139  err_lv,
2140  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
2141  } else {
2142  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
2143  err_lv,
2144  cgen_state_->llInt(static_cast<int32_t>(0)));
2145  }
2146  auto error_bb = llvm::BasicBlock::Create(
2147  cgen_state_->context_, ".error_exit", query_func, new_bb);
2148  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
2149  llvm::CallInst::Create(
2150  cgen_state_->module_->getFunction("record_error_code"),
2151  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
2152  "",
2153  error_bb);
2154  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2155  llvm::ReplaceInstWithInst(&br_instr,
2156  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2157  done_splitting = true;
2158  break;
2159  }
2160  }
2161  }
2162  CHECK(done_splitting);
2163 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1119
double g_running_query_interrupt_freq
Definition: Execute.cpp:122
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1163
QuerySessionId current_query_session_
Definition: Execute.h:1121
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1057
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:77
#define CHECK_GT(x, y)
Definition: Logger.h:221
unsigned getExpOfTwo(unsigned n)
Definition: MathUtils.cpp:23
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:164
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1162
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LE(x, y)
Definition: Logger.h:220
unsigned gridSize() const
Definition: Execute.cpp:3440
#define CHECK(condition)
Definition: Logger.h:209
unsigned blockSize() const
Definition: Execute.cpp:3454
#define VLOG(n)
Definition: Logger.h:303
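The core splitting pattern above, reduced to a standalone helper (assuming LLVM headers are available and that query_func returns void; the watchdog/interrupt branches are omitted): split the block right after the call that yields an i32 error code, then branch to a fresh error-exit block whenever the code is non-zero.

#include <llvm/IR/BasicBlock.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Instructions.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>

using namespace llvm;

void addErrorCheckAfter(Function* query_func, CallInst* row_func_call) {
  BasicBlock* bb = row_func_call->getParent();
  // splitBasicBlock() moves the tail into new_bb and leaves bb terminated
  // by an unconditional branch to new_bb.
  BasicBlock* new_bb = bb->splitBasicBlock(row_func_call->getNextNode());
  Instruction* br_instr = bb->getTerminator();
  IRBuilder<> ir_builder(br_instr);
  Value* has_error =
      ir_builder.CreateICmpNE(row_func_call, ir_builder.getInt32(0));
  BasicBlock* error_bb =
      BasicBlock::Create(bb->getContext(), ".error_exit", query_func, new_bb);
  ReturnInst::Create(bb->getContext(), error_bb);  // bail out of the kernel
  // Swap the fall-through branch for the conditional error check.
  ReplaceInstWithInst(br_instr,
                      BranchInst::Create(error_bb, new_bb, has_error));
}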
std::vector< std::unique_ptr< ExecutionKernel > > Executor::createKernels ( SharedKernelContext &  shared_context,
const RelAlgExecutionUnit &  ra_exe_unit,
ColumnFetcher &  column_fetcher,
const std::vector< InputTableInfo > &  table_infos,
const ExecutionOptions &  eo,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const size_t  context_count,
const QueryCompilationDescriptor &  query_comp_desc,
const QueryMemoryDescriptor &  query_mem_desc,
RenderInfo *  render_info,
std::unordered_set< int > &  available_gpus,
int &  available_cpus 
)
private

Determines execution dispatch mode and required fragments for a given query step, then creates kernels to execute the query and returns them for launch.

Definition at line 2111 of file Execute.cpp.

References ExecutionOptions::allow_multifrag, catalog_(), CHECK, CHECK_GE, CHECK_GT, anonymous_namespace{Execute.cpp}::checkWorkUnitWatchdog(), data_mgr_(), g_inner_join_fragment_skipping, QueryCompilationDescriptor::getDeviceType(), QueryMemoryDescriptor::getEntryCount(), SharedKernelContext::getFragOffsets(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, ExecutionOptions::gpu_input_mem_limit_percent, Data_Namespace::GPU_LEVEL, anonymous_namespace{Execute.cpp}::has_lazy_fetched_columns(), logger::INFO, RelAlgExecutionUnit::input_descs, KernelPerFragment, LOG, MultifragmentKernel, ExecutionOptions::outer_fragment_indices, Projection, query_mem_desc, RelAlgExecutionUnit::target_exprs, QueryMemoryDescriptor::toString(), RelAlgExecutionUnit::use_bump_allocator, VLOG, and ExecutionOptions::with_watchdog.

2111 std::vector<std::unique_ptr<ExecutionKernel>> Executor::createKernels(
2112  SharedKernelContext& shared_context,
2113  const RelAlgExecutionUnit& ra_exe_unit,
2114  ColumnFetcher& column_fetcher,
2115  const std::vector<InputTableInfo>& table_infos,
2116  const ExecutionOptions& eo,
2117  const bool is_agg,
2118  const bool allow_single_frag_table_opt,
2119  const size_t context_count,
2120  const QueryCompilationDescriptor& query_comp_desc,
2121  const QueryMemoryDescriptor& query_mem_desc,
2122  RenderInfo* render_info,
2123  std::unordered_set<int>& available_gpus,
2124  int& available_cpus) {
2125  std::vector<std::unique_ptr<ExecutionKernel>> execution_kernels;
2126 
2127  QueryFragmentDescriptor fragment_descriptor(
2128  ra_exe_unit,
2129  table_infos,
2130  query_comp_desc.getDeviceType() == ExecutorDeviceType::GPU
2131  ? data_mgr_->getMemoryInfo(Data_Namespace::MemoryLevel::GPU_LEVEL)
2132  : std::vector<Data_Namespace::MemoryInfo>{},
2133  eo.gpu_input_mem_limit_percent,
2134  eo.outer_fragment_indices);
2135  CHECK(!ra_exe_unit.input_descs.empty());
2136 
2137  const auto device_type = query_comp_desc.getDeviceType();
2138  const bool uses_lazy_fetch =
2139  plan_state_->allow_lazy_fetch_ &&
2140  has_lazy_fetched_columns(getColLazyFetchInfo(ra_exe_unit.target_exprs));
2141  const bool use_multifrag_kernel = (device_type == ExecutorDeviceType::GPU) &&
2142  eo.allow_multifrag && (!uses_lazy_fetch || is_agg);
2143  const auto device_count = deviceCount(device_type);
2144  CHECK_GT(device_count, 0);
2145 
2146  fragment_descriptor.buildFragmentKernelMap(ra_exe_unit,
2147  shared_context.getFragOffsets(),
2148  device_count,
2149  device_type,
2150  use_multifrag_kernel,
2151  g_inner_join_fragment_skipping,
2152  this);
2153  if (eo.with_watchdog && fragment_descriptor.shouldCheckWorkUnitWatchdog()) {
2154  checkWorkUnitWatchdog(ra_exe_unit, table_infos, *catalog_, device_type, device_count);
2155  }
2156 
2157  if (use_multifrag_kernel) {
2158  VLOG(1) << "Creating multifrag execution kernels";
2159  VLOG(1) << query_mem_desc.toString();
2160 
2161  // NB: We should never be on this path when the query is retried because of running
2162  // out of group by slots; also, for scan only queries on CPU we want the
2163  // high-granularity, fragment by fragment execution instead. For scan only queries on
2164  // GPU, we want the multifrag kernel path to save the overhead of allocating an output
2165  // buffer per fragment.
2166  auto multifrag_kernel_dispatch = [&ra_exe_unit,
2167  &execution_kernels,
2168  &column_fetcher,
2169  &eo,
2170  &query_comp_desc,
2171  &query_mem_desc,
2172  render_info](const int device_id,
2173  const FragmentsList& frag_list,
2174  const int64_t rowid_lookup_key) {
2175  execution_kernels.emplace_back(
2176  std::make_unique<ExecutionKernel>(ra_exe_unit,
2177  ExecutorDeviceType::GPU,
2178  device_id,
2179  eo,
2180  column_fetcher,
2181  query_comp_desc,
2182  query_mem_desc,
2183  frag_list,
2184  ExecutorDispatchMode::MultifragmentKernel,
2185  render_info,
2186  rowid_lookup_key));
2187  };
2188  fragment_descriptor.assignFragsToMultiDispatch(multifrag_kernel_dispatch);
2189  } else {
2190  VLOG(1) << "Creating one execution kernel per fragment";
2191  VLOG(1) << query_mem_desc.toString();
2192 
2193  if (!ra_exe_unit.use_bump_allocator && allow_single_frag_table_opt &&
2194  (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) &&
2195  table_infos.size() == 1 && table_infos.front().table_id > 0) {
2196  const auto max_frag_size =
2197  table_infos.front().info.getFragmentNumTuplesUpperBound();
2198  if (max_frag_size < query_mem_desc.getEntryCount()) {
2199  LOG(INFO) << "Lowering scan limit from " << query_mem_desc.getEntryCount()
2200  << " to match max fragment size " << max_frag_size
2201  << " for kernel per fragment execution path.";
2202  throw CompilationRetryNewScanLimit(max_frag_size);
2203  }
2204  }
2205 
2206  size_t frag_list_idx{0};
2207  auto fragment_per_kernel_dispatch = [&ra_exe_unit,
2208  &execution_kernels,
2209  &column_fetcher,
2210  &eo,
2211  &frag_list_idx,
2212  &device_type,
2213  &query_comp_desc,
2214  &query_mem_desc,
2215  render_info](const int device_id,
2216  const FragmentsList& frag_list,
2217  const int64_t rowid_lookup_key) {
2218  if (!frag_list.size()) {
2219  return;
2220  }
2221  CHECK_GE(device_id, 0);
2222 
2223  execution_kernels.emplace_back(
2224  std::make_unique<ExecutionKernel>(ra_exe_unit,
2225  device_type,
2226  device_id,
2227  eo,
2228  column_fetcher,
2229  query_comp_desc,
2230  query_mem_desc,
2231  frag_list,
2232  ExecutorDispatchMode::KernelPerFragment,
2233  render_info,
2234  rowid_lookup_key));
2235  ++frag_list_idx;
2236  };
2237 
2238  fragment_descriptor.assignFragsToKernelDispatch(fragment_per_kernel_dispatch,
2239  ra_exe_unit);
2240  }
2241 
2242  return execution_kernels;
2243 }
bool is_agg(const Analyzer::Expr *expr)
std::vector< Analyzer::Expr * > target_exprs
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1103
ExecutorDeviceType getDeviceType() const
const std::vector< uint64_t > & getFragOffsets()
std::string toString() const
#define LOG(tag)
Definition: Logger.h:203
std::vector< size_t > outer_fragment_indices
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo(const std::vector< Analyzer::Expr * > &target_exprs) const
Definition: Execute.cpp:364
std::vector< InputDescriptor > input_descs
#define CHECK_GE(x, y)
Definition: Logger.h:222
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:673
#define CHECK_GT(x, y)
Definition: Logger.h:221
std::vector< FragmentsPerTable > FragmentsList
bool g_inner_join_fragment_skipping
Definition: Execute.cpp:87
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1102
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1072
void checkWorkUnitWatchdog(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const Catalog_Namespace::Catalog &cat, const ExecutorDeviceType device_type, const int device_count)
Definition: Execute.cpp:1136
std::vector< MemoryInfo > getMemoryInfo(const MemoryLevel memLevel)
Definition: DataMgr.cpp:350
#define CHECK(condition)
Definition: Logger.h:209
double gpu_input_mem_limit_percent
bool has_lazy_fetched_columns(const std::vector< ColumnLazyFetchInfo > &fetched_cols)
Definition: Execute.cpp:2100
#define VLOG(n)
Definition: Logger.h:303

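Both dispatch paths share one callback shape: the fragment descriptor owns the (device, fragment-list) assignment policy and invokes a lambda that materializes one kernel per assignment. A condensed sketch with hypothetical simplified types:

#include <functional>
#include <memory>
#include <utility>
#include <vector>

struct KernelSketch {
  int device_id;
  std::vector<int> fragment_ids;
};

using DispatchFn = std::function<void(int device_id, std::vector<int> frags)>;

// The descriptor decides assignments; the caller only supplies the callback.
void assignFragsToKernelDispatch(
    const std::vector<std::vector<int>>& per_device_frags,
    const DispatchFn& dispatch) {
  for (int device_id = 0;
       device_id < static_cast<int>(per_device_frags.size());
       ++device_id) {
    if (!per_device_frags[device_id].empty()) {  // mirrors the empty-list early-out
      dispatch(device_id, per_device_frags[device_id]);
    }
  }
}

std::vector<std::unique_ptr<KernelSketch>> createKernelsSketch(
    const std::vector<std::vector<int>>& per_device_frags) {
  std::vector<std::unique_ptr<KernelSketch>> kernels;
  assignFragsToKernelDispatch(
      per_device_frags, [&kernels](const int device_id, std::vector<int> frags) {
        kernels.push_back(std::make_unique<KernelSketch>(
            KernelSketch{device_id, std::move(frags)}));
      });
  return kernels;
}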

CudaMgr_Namespace::CudaMgr* Executor::cudaMgr ( ) const
inlineprivate

Definition at line 548 of file Execute.h.

References CHECK, data_mgr_, and Data_Namespace::DataMgr::getCudaMgr().

Referenced by isArchPascalOrLater().

548  {
549  CHECK(data_mgr_);
550  auto cuda_mgr = data_mgr_->getCudaMgr();
551  CHECK(cuda_mgr);
552  return cuda_mgr;
553  }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:218
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1103
#define CHECK(condition)
Definition: Logger.h:209


int Executor::deviceCount ( const ExecutorDeviceType  device_type) const
private

Definition at line 673 of file Execute.cpp.

References GPU.

673  {
674  if (device_type == ExecutorDeviceType::GPU) {
675  return cudaMgr()->getDeviceCount();
676  } else {
677  return 1;
678  }
679 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:548
int getDeviceCount() const
Definition: CudaMgr.h:86
int Executor::deviceCountForMemoryLevel ( const Data_Namespace::MemoryLevel  memory_level) const
private

Definition at line 681 of file Execute.cpp.

References CPU, GPU, and Data_Namespace::GPU_LEVEL.

682  {
683  return memory_level == GPU_LEVEL ? deviceCount(ExecutorDeviceType::GPU)
684  : deviceCount(ExecutorDeviceType::CPU);
685 }
ExecutorDeviceType
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:673
int64_t Executor::deviceCycles ( int  milliseconds) const
private

Definition at line 3468 of file Execute.cpp.

3468  {
3469  const auto& dev_props = cudaMgr()->getAllDeviceProperties();
3470  return static_cast<int64_t>(dev_props.front().clockKhz) * milliseconds;
3471 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:548
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:120
void Executor::enableRuntimeQueryInterrupt ( const double  runtime_query_check_freq,
const unsigned  pending_query_check_freq 
) const

Definition at line 4227 of file Execute.cpp.

References g_enable_runtime_query_interrupt, g_pending_query_interrupt_freq, and g_running_query_interrupt_freq.

4229  {
4230  // The only scenario in which we intentionally call this function is
4231  // to allow runtime query interrupt in QueryRunner for test cases.
4232  // Because the test machine's default setting does not allow runtime query
4233  // interrupt, we have to turn it on within test code when necessary.
4235  g_pending_query_interrupt_freq = pending_query_check_freq;
4236  g_running_query_interrupt_freq = runtime_query_check_freq;
4239  }
4240 }
double g_running_query_interrupt_freq
Definition: Execute.cpp:122
unsigned g_pending_query_interrupt_freq
Definition: Execute.cpp:121
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:118
void Executor::enrollQuerySession ( const QuerySessionId &  query_session,
const std::string &  query_str,
const std::string &  submitted_time_str,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_session_status 
)

Definition at line 4050 of file Execute.cpp.

4050 void Executor::enrollQuerySession(
4051  const QuerySessionId& query_session,
4052  const std::string& query_str,
4053  const std::string& submitted_time_str,
4054  const size_t executor_id,
4055  const QuerySessionStatus::QueryStatus query_session_status) {
4056  // enroll the query session into the Executor's session map
4057  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
4058  if (query_session.empty()) {
4059  return;
4060  }
4061 
4062  addToQuerySessionList(query_session,
4063  query_str,
4064  submitted_time_str,
4065  executor_id,
4066  query_session_status,
4067  session_write_lock);
4068 
4069  if (query_session_status == QuerySessionStatus::QueryStatus::RUNNING_QUERY_KERNEL) {
4070  current_query_session_ = query_session;
4071  }
4072 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1119
QuerySessionId current_query_session_
Definition: Execute.h:1121
bool addToQuerySessionList(const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:4074
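A reduced sketch of the locking discipline used by the session map: writers (enroll/clear) take an exclusive lock, while readers on the hot path (interrupt checks) take a shared lock on the same mutex. std::shared_mutex stands in for mapd_shared_mutex, and the value type is simplified.

#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>

class SessionRegistrySketch {
 public:
  void enroll(const std::string& session, const std::string& query_str) {
    std::unique_lock<std::shared_mutex> write_lock(mutex_);
    if (session.empty()) {
      return;  // mirrors the early-out in enrollQuerySession()
    }
    queries_[session] = query_str;
    current_session_ = session;
  }

  bool isCurrent(const std::string& session) const {
    std::shared_lock<std::shared_mutex> read_lock(mutex_);
    return session == current_session_;
  }

 private:
  mutable std::shared_mutex mutex_;
  std::unordered_map<std::string, std::string> queries_;
  std::string current_session_;
};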
ResultSetPtr Executor::executeExplain ( const QueryCompilationDescriptor &  query_comp_desc)
private

Definition at line 1735 of file Execute.cpp.

References QueryCompilationDescriptor::getIR().

1735  {
1736  return std::make_shared<ResultSet>(query_comp_desc.getIR());
1737 }


int32_t Executor::executePlanWithGroupBy ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationResult &  compilation_result,
const bool  hoist_literals,
ResultSetPtr *  results,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< size_t >  outer_tab_frag_ids,
QueryExecutionContext *  query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr *  data_mgr,
const int  device_id,
const int  outer_table_id,
const int64_t  limit,
const uint32_t  start_rowid,
const uint32_t  num_tables,
const bool  allow_runtime_interrupt,
RenderInfo render_info,
const int64_t  rows_to_process = -1 
)
private

Definition at line 3155 of file Execute.cpp.

References CHECK, CHECK_NE, anonymous_namespace{Execute.cpp}::check_rows_less_than_needed(), CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, ERR_WIDTH_BUCKET_INVALID_ARGUMENT, logger::FATAL, g_enable_dynamic_watchdog, CompilationResult::generated_code, QueryMemoryDescriptor::getEntryCount(), QueryExecutionContext::getRowSet(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, RelAlgExecutionUnit::groupby_exprs, INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, shared::printContainer(), QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, RenderInfo::render_allocator_map_ptr, RelAlgExecutionUnit::scan_limit, QueryMemoryDescriptor::setEntryCount(), RelAlgExecutionUnit::union_all, RenderInfo::useCudaBuffers(), and VLOG.

3174  {
3175  auto timer = DEBUG_TIMER(__func__);
3176  INJECT_TIMER(executePlanWithGroupBy);
3177  // TODO: get results via a separate method, but need to do something with literals.
3178  CHECK(!results || !(*results));
3179  if (col_buffers.empty()) {
3180  return 0;
3181  }
3182  CHECK_NE(ra_exe_unit.groupby_exprs.size(), size_t(0));
3183  // TODO(alex):
3184  // 1. Optimize size (make keys more compact).
3185  // 2. Resize on overflow.
3186  // 3. Optimize runtime.
3187  auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
3188  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
3189  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
3190  if (allow_runtime_interrupt) {
3191  bool isInterrupted = false;
3192  {
3193  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
3194  const auto query_session = getCurrentQuerySession(session_read_lock);
3195  isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock);
3196  }
3197  if (isInterrupted) {
3198  return ERR_INTERRUPTED;
3199  }
3200  }
3201  if (g_enable_dynamic_watchdog && interrupted_.load()) {
3202  return ERR_INTERRUPTED;
3203  }
3204 
3205  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
3206  if (render_info && render_info->useCudaBuffers()) {
3207  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
3208  }
3209 
3210  VLOG(2) << "bool(ra_exe_unit.union_all)=" << bool(ra_exe_unit.union_all)
3211  << " ra_exe_unit.input_descs="
3212  << shared::printContainer(ra_exe_unit.input_descs)
3213  << " ra_exe_unit.input_col_descs="
3214  << shared::printContainer(ra_exe_unit.input_col_descs)
3215  << " ra_exe_unit.scan_limit=" << ra_exe_unit.scan_limit
3216  << " num_rows=" << shared::printContainer(num_rows)
3217  << " frag_offsets=" << shared::printContainer(frag_offsets)
3218  << " query_exe_context->query_buffers_->num_rows_="
3219  << query_exe_context->query_buffers_->num_rows_
3220  << " query_exe_context->query_mem_desc_.getEntryCount()="
3221  << query_exe_context->query_mem_desc_.getEntryCount()
3222  << " device_id=" << device_id << " outer_table_id=" << outer_table_id
3223  << " scan_limit=" << scan_limit << " start_rowid=" << start_rowid
3224  << " num_tables=" << num_tables;
3225 
3226  RelAlgExecutionUnit ra_exe_unit_copy = ra_exe_unit;
3227  // For UNION ALL, filter out input_descs and input_col_descs that are not associated
3228  // with outer_table_id.
3229  if (ra_exe_unit_copy.union_all) {
3230  // Sort outer_table_id first, then pop the rest off of ra_exe_unit_copy.input_descs.
3231  std::stable_sort(ra_exe_unit_copy.input_descs.begin(),
3232  ra_exe_unit_copy.input_descs.end(),
3233  [outer_table_id](auto const& a, auto const& b) {
3234  return a.getTableId() == outer_table_id &&
3235  b.getTableId() != outer_table_id;
3236  });
3237  while (!ra_exe_unit_copy.input_descs.empty() &&
3238  ra_exe_unit_copy.input_descs.back().getTableId() != outer_table_id) {
3239  ra_exe_unit_copy.input_descs.pop_back();
3240  }
3241  // Filter ra_exe_unit_copy.input_col_descs.
3242  ra_exe_unit_copy.input_col_descs.remove_if(
3243  [outer_table_id](auto const& input_col_desc) {
3244  return input_col_desc->getScanDesc().getTableId() != outer_table_id;
3245  });
3246  query_exe_context->query_mem_desc_.setEntryCount(ra_exe_unit_copy.scan_limit);
3247  }
3248 
3249  if (device_type == ExecutorDeviceType::CPU) {
3250  const int32_t scan_limit_for_query =
3251  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
3252  const int32_t max_matched = scan_limit_for_query == 0
3253  ? query_exe_context->query_mem_desc_.getEntryCount()
3254  : scan_limit_for_query;
3255 
3256  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
3257  compilation_result.generated_code);
3258  CHECK(cpu_generated_code);
3259  query_exe_context->launchCpuCode(ra_exe_unit_copy,
3260  cpu_generated_code.get(),
3261  hoist_literals,
3262  hoist_buf,
3263  col_buffers,
3264  num_rows,
3265  frag_offsets,
3266  max_matched,
3267  &error_code,
3268  num_tables,
3269  join_hash_table_ptrs,
3270  rows_to_process);
3271  } else {
3272  try {
3273  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
3274  compilation_result.generated_code);
3275  CHECK(gpu_generated_code);
3276  query_exe_context->launchGpuCode(
3277  ra_exe_unit_copy,
3278  gpu_generated_code.get(),
3279  hoist_literals,
3280  hoist_buf,
3281  col_buffers,
3282  num_rows,
3283  frag_offsets,
3284  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
3285  data_mgr,
3286  blockSize(),
3287  gridSize(),
3288  device_id,
3289  compilation_result.gpu_smem_context.getSharedMemorySize(),
3290  &error_code,
3291  num_tables,
3292  allow_runtime_interrupt,
3293  join_hash_table_ptrs,
3294  render_allocator_map_ptr);
3295  } catch (const OutOfMemory&) {
3296  return ERR_OUT_OF_GPU_MEM;
3297  } catch (const OutOfRenderMemory&) {
3298  return ERR_OUT_OF_RENDER_MEM;
3299  } catch (const StreamingTopNNotSupportedInRenderQuery&) {
3300  return ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY;
3301  } catch (const std::exception& e) {
3302  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
3303  }
3304  }
3305 
3306  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
3307  error_code == Executor::ERR_DIV_BY_ZERO ||
3308  error_code == Executor::ERR_OUT_OF_TIME ||
3309  error_code == Executor::ERR_INTERRUPTED ||
3310  error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES ||
3311  error_code == Executor::ERR_GEOS ||
3312  error_code == Executor::ERR_WIDTH_BUCKET_INVALID_ARGUMENT) {
3313  return error_code;
3314  }
3315 
3316  if (results && error_code != Executor::ERR_OVERFLOW_OR_UNDERFLOW &&
3317  error_code != Executor::ERR_DIV_BY_ZERO && !render_allocator_map_ptr) {
3318  *results = query_exe_context->getRowSet(ra_exe_unit_copy,
3319  query_exe_context->query_mem_desc_);
3320  CHECK(*results);
3321  VLOG(2) << "results->rowCount()=" << (*results)->rowCount();
3322  (*results)->holdLiterals(hoist_buf);
3323  }
3324  if (error_code < 0 && render_allocator_map_ptr) {
3325  auto const adjusted_scan_limit =
3326  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
3327  // More rows passed the filter than available slots. We don't have a count to check,
3328  // so assume we met the limit if a scan limit is set
3329  if (adjusted_scan_limit != 0) {
3330  return 0;
3331  } else {
3332  return error_code;
3333  }
3334  }
3335  if (results && error_code &&
3336  (!scan_limit || check_rows_less_than_needed(*results, scan_limit))) {
3337  return error_code; // unlucky, not enough results and we ran out of slots
3338  }
3339 
3340  return 0;
3341 }
QuerySessionId & getCurrentQuerySession(mapd_shared_lock< mapd_shared_mutex > &read_lock)
Definition: Execute.cpp:3946
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1119
int32_t executePlanWithGroupBy(const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr *results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info, const int64_t rows_to_process=-1)
Definition: Execute.cpp:3155
bool useCudaBuffers() const
Definition: RenderInfo.cpp:68
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1163
std::vector< int8_t * > getJoinHashTablePtrs(const ExecutorDeviceType device_type, const int device_id)
Definition: Execute.cpp:3343
void setEntryCount(const size_t val)
std::atomic< bool > interrupted_
Definition: Execute.h:1082
GpuSharedMemoryContext gpu_smem_context
const std::optional< bool > union_all
#define LOG(tag)
Definition: Logger.h:203
size_t getSharedMemorySize() const
std::vector< int64_t * > launchCpuCode(const RelAlgExecutionUnit &ra_exe_unit, const CpuCompilationContext *fn_ptrs, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, int32_t *error_code, const uint32_t num_tables, const std::vector< int8_t * > &join_hash_tables, const int64_t num_rows_to_process=-1)
std::vector< InputDescriptor > input_descs
static const int32_t ERR_GEOS
Definition: Execute.h:1169
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:77
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
bool checkIsQuerySessionInterrupted(const std::string &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
Definition: Execute.cpp:4207
std::unique_ptr< QueryMemoryInitializer > query_buffers_
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY
Definition: Execute.h:1167
static const int32_t ERR_DIV_BY_ZERO
Definition: Execute.h:1155
ResultSetPtr getRowSet(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc) const
#define INJECT_TIMER(DESC)
Definition: measure.h:93
#define CHECK_NE(x, y)
Definition: Logger.h:218
static const int32_t ERR_OUT_OF_RENDER_MEM
Definition: Execute.h:1159
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW
Definition: Execute.h:1161
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1162
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
Definition: Execute.h:1168
static const int32_t ERR_OUT_OF_GPU_MEM
Definition: Execute.h:1156
std::shared_ptr< CompilationContext > generated_code
QueryMemoryDescriptor query_mem_desc_
std::vector< int8_t > serializeLiterals(const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
Definition: Execute.cpp:411
unsigned gridSize() const
Definition: Execute.cpp:3440
std::unordered_map< int, CgenState::LiteralValues > literal_values
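The UNION ALL branch above (source lines 3229-3245) isolates the descriptors belonging to outer_table_id with a stable-sort-to-front, pop-the-rest, remove_if sequence. A toy self-contained version of that sequence over plain ints; the containers and ids are stand-ins for the InputDescriptor/InputColDescriptor lists:

  #include <algorithm>
  #include <iostream>
  #include <list>
  #include <vector>

  int main() {
    const int outer_table_id = 7;
    std::vector<int> input_descs{3, 7, 5, 7};       // table id per input desc
    std::list<int> input_col_descs{3, 3, 7, 5, 7};  // table id per column desc

    // Stable-sort matches to the front; the comparator is a valid strict weak
    // ordering because it effectively compares the bool key (id != outer).
    std::stable_sort(input_descs.begin(), input_descs.end(),
                     [outer_table_id](int a, int b) {
                       return a == outer_table_id && b != outer_table_id;
                     });
    // Pop non-matching entries off the back, as in the while loop above.
    while (!input_descs.empty() && input_descs.back() != outer_table_id) {
      input_descs.pop_back();
    }
    // Filter the column descriptor list the same way.
    input_col_descs.remove_if(
        [outer_table_id](int id) { return id != outer_table_id; });

    for (int id : input_descs) std::cout << id << ' ';      // prints: 7 7
    std::cout << '\n';
    for (int id : input_col_descs) std::cout << id << ' ';  // prints: 7 7
    std::cout << '\n';
  }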