OmniSciDB  bf83d84833
Executor Class Reference

#include <Execute.h>


Classes

class  FetchCacheAnchor
 
struct  GroupColLLVMValue
 
struct  JoinHashTableOrError
 

Public Types

using ExecutorId = size_t
 
using CachedCardinality = std::pair< bool, size_t >
 

Public Member Functions

 Executor (const ExecutorId id, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
 
const TemporaryTables * getTemporaryTables ()
 
StringDictionaryProxy * getStringDictionaryProxy (const int dict_id, const bool with_generation) const
 
StringDictionaryProxy * getStringDictionaryProxy (const int dictId, const std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const
 
bool isCPUOnly () const
 
bool isArchMaxwell (const ExecutorDeviceType dt) const
 
bool containsLeftDeepOuterJoin () const
 
const ColumnDescriptor * getColumnDescriptor (const Analyzer::ColumnVar *) const
 
const ColumnDescriptor * getPhysicalColumnDescriptor (const Analyzer::ColumnVar *, int) const
 
const Catalog_Namespace::Catalog * getCatalog () const
 
void setCatalog (const Catalog_Namespace::Catalog *catalog)
 
const std::shared_ptr< RowSetMemoryOwner > getRowSetMemoryOwner () const
 
const TemporaryTables * getTemporaryTables () const
 
Fragmenter_Namespace::TableInfo getTableInfo (const int table_id) const
 
const TableGeneration & getTableGeneration (const int table_id) const
 
ExpressionRange getColRange (const PhysicalInput &) const
 
size_t getNumBytesForFetchedRow (const std::set< int > &table_ids_to_fetch) const
 
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo (const std::vector< Analyzer::Expr * > &target_exprs) const
 
void registerActiveModule (void *module, const int device_id) const
 
void unregisterActiveModule (void *module, const int device_id) const
 
void interrupt (const std::string &query_session="", const std::string &interrupt_session="")
 
void resetInterrupt ()
 
void enableRuntimeQueryInterrupt (const double runtime_query_check_freq, const unsigned pending_query_check_freq) const
 
int8_t warpSize () const
 
unsigned gridSize () const
 
unsigned numBlocksPerMP () const
 
unsigned blockSize () const
 
size_t maxGpuSlabSize () const
 
ResultSetPtr executeWorkUnit (size_t &max_groups_buffer_entry_guess, const bool is_agg, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
void executeUpdate (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const UpdateLogForFragment::Callback &cb, const bool is_agg)
 
void setupCaching (const std::unordered_set< PhysicalInput > &phys_inputs, const std::unordered_set< int > &phys_table_ids)
 
void setColRangeCache (const AggregatedColRange &aggregated_col_range)
 
QuerySessionId & getCurrentQuerySession (mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
size_t getRunningExecutorId (mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool checkCurrentQuerySession (const std::string &candidate_query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
void invalidateRunningQuerySession (mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool addToQuerySessionList (const QuerySessionId &query_session, const std::string &query_str, const std::chrono::time_point< std::chrono::system_clock > submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool removeFromQuerySessionList (const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
void setQuerySessionAsInterrupted (const QuerySessionId &query_session, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool checkIsQuerySessionInterrupted (const QuerySessionId &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool checkIsQuerySessionEnrolled (const QuerySessionId &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
bool updateQuerySessionStatusWithLock (const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, const QuerySessionStatus::QueryStatus updated_query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
bool updateQuerySessionExecutorAssignment (const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, const size_t executor_id, mapd_unique_lock< mapd_shared_mutex > &write_lock)
 
std::vector< QuerySessionStatus > getQuerySessionInfo (const QuerySessionId &query_session, mapd_shared_lock< mapd_shared_mutex > &read_lock)
 
mapd_shared_mutex & getSessionLock ()
 
CurrentQueryStatus attachExecutorToQuerySession (std::shared_ptr< const query_state::QueryState > &query_state)
 
void checkPendingQueryStatus (const QuerySessionId &query_session)
 
void clearQuerySessionStatus (const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, bool acquire_spin_lock)
 
void updateQuerySessionStatus (std::shared_ptr< const query_state::QueryState > &query_state, const QuerySessionStatus::QueryStatus new_query_status)
 
void updateQuerySessionStatus (const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, const QuerySessionStatus::QueryStatus new_query_status)
 
void enrollQuerySession (const QuerySessionId &query_session, const std::string &query_str, const std::chrono::time_point< std::chrono::system_clock > submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_session_status)
 
void addToCardinalityCache (const std::string &cache_key, const size_t cache_value)
 
CachedCardinality getCachedCardinality (const std::string &cache_key)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 

Static Public Member Functions

static std::shared_ptr< Executor > getExecutor (const ExecutorId id, const std::string &debug_dir="", const std::string &debug_file="", const SystemParameters system_parameters=SystemParameters())
 
static void nukeCacheOfExecutors ()
 
static void clearMemory (const Data_Namespace::MemoryLevel memory_level)
 
static size_t getArenaBlockSize ()
 
static std::pair< int64_t, int32_t > reduceResults (const SQLAgg agg, const SQLTypeInfo &ti, const int64_t agg_init_val, const int8_t out_byte_width, const int64_t *out_vec, const size_t out_vec_sz, const bool is_group_by, const bool float_argument_input)
 
static void addCodeToCache (const CodeCacheKey &, std::shared_ptr< CompilationContext >, llvm::Module *, CodeCache &)
 

Static Public Attributes

static const ExecutorId UNITARY_EXECUTOR_ID = 0
 
static const size_t high_scan_limit
 
static const int32_t ERR_DIV_BY_ZERO {1}
 
static const int32_t ERR_OUT_OF_GPU_MEM {2}
 
static const int32_t ERR_OUT_OF_SLOTS {3}
 
static const int32_t ERR_UNSUPPORTED_SELF_JOIN {4}
 
static const int32_t ERR_OUT_OF_RENDER_MEM {5}
 
static const int32_t ERR_OUT_OF_CPU_MEM {6}
 
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW {7}
 
static const int32_t ERR_OUT_OF_TIME {9}
 
static const int32_t ERR_INTERRUPTED {10}
 
static const int32_t ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED {11}
 
static const int32_t ERR_TOO_MANY_LITERALS {12}
 
static const int32_t ERR_STRING_CONST_IN_RESULTSET {13}
 
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY {14}
 
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES {15}
 
static const int32_t ERR_GEOS {16}
 
static std::mutex compilation_mutex_
 
static std::mutex kernel_mutex_
 

Private Types

using PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>
 

Private Member Functions

void clearMetaInfoCache ()
 
int deviceCount (const ExecutorDeviceType) const
 
int deviceCountForMemoryLevel (const Data_Namespace::MemoryLevel memory_level) const
 
llvm::Value * codegenWindowFunction (const size_t target_index, const CompilationOptions &co)
 
llvm::Value * codegenWindowFunctionAggregate (const CompilationOptions &co)
 
llvm::BasicBlock * codegenWindowResetStateControlFlow ()
 
void codegenWindowFunctionStateInit (llvm::Value *aggregate_state)
 
llvm::Value * codegenWindowFunctionAggregateCalls (llvm::Value *aggregate_state, const CompilationOptions &co)
 
void codegenWindowAvgEpilogue (llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
 
llvm::Value * codegenAggregateWindowState ()
 
llvm::Value * aggregateWindowStatePtr ()
 
bool isArchPascalOrLater (const ExecutorDeviceType dt) const
 
bool needFetchAllFragments (const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
 
void executeWorkUnitPerFragment (const RelAlgExecutionUnit &ra_exe_unit, const InputTableInfo &table_info, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, PerFragmentCallBack &cb)
 Compiles and dispatches a work unit per fragment, processing results with the per-fragment callback. Currently used for computing metrics over fragments (metadata). More...
 
ResultSetPtr executeExplain (const QueryCompilationDescriptor &)
 
ResultSetPtr executeTableFunction (const TableFunctionExecutionUnit exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat)
 Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps. More...
 
ExecutorDeviceType getDeviceTypeForTargets (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType requested_device_type)
 
ResultSetPtr collectAllDeviceResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
 
ResultSetPtr collectAllDeviceShardedTopResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
 
std::unordered_map< int, const Analyzer::BinOper * > getInnerTabIdToJoinCond () const
 
std::vector< std::unique_ptr< ExecutionKernel > > createKernels (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, ColumnFetcher &column_fetcher, const std::vector< InputTableInfo > &table_infos, const ExecutionOptions &eo, const bool is_agg, const bool allow_single_frag_table_opt, const size_t context_count, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, RenderInfo *render_info, std::unordered_set< int > &available_gpus, int &available_cpus)
 
template<typename THREAD_POOL >
void launchKernels (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels)
 
std::vector< size_t > getTableFragmentIndices (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type, const size_t table_idx, const size_t outer_frag_idx, std::map< int, const TableFragments * > &selected_tables_fragments, const std::unordered_map< int, const Analyzer::BinOper * > &inner_table_id_to_join_condition)
 
bool skipFragmentPair (const Fragmenter_Namespace::FragmentInfo &outer_fragment_info, const Fragmenter_Namespace::FragmentInfo &inner_fragment_info, const int inner_table_id, const std::unordered_map< int, const Analyzer::BinOper * > &inner_table_id_to_join_condition, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
 
FetchResult fetchChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments * > &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator)
 
FetchResult fetchUnionChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< int, const TableFragments * > &, const FragmentsList &selected_fragments, const Catalog_Namespace::Catalog &, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator)
 
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags (const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< int, const TableFragments * > &all_tables_fragments)
 
void buildSelectedFragsMapping (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
void buildSelectedFragsMappingForUnion (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< size_t > getFragmentCount (const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
 
int32_t executePlanWithGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, RenderInfo *render_info)
 
int32_t executePlanWithoutGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const std::vector< Analyzer::Expr * > &target_exprs, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, QueryExecutionContext *query_exe_context, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *data_mgr, const int device_id, const uint32_t start_rowid, const uint32_t num_tables, RenderInfo *render_info)
 
ResultSetPtr resultsUnion (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< int64_t > getJoinHashTablePtrs (const ExecutorDeviceType device_type, const int device_id)
 
ResultSetPtr reduceMultiDeviceResults (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceMultiDeviceResultSets (std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceSpeculativeTopN (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr executeWorkUnitImpl (size_t &max_groups_buffer_entry_guess, const bool is_agg, const bool allow_single_frag_table_opt, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, const Catalog_Namespace::Catalog &, std::shared_ptr< RowSetMemoryOwner >, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
std::vector< llvm::Value * > inlineHoistedLiterals ()
 
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit (const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
 
llvm::BasicBlock * codegenSkipDeletedOuterTableRow (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::vector< JoinLoop > buildJoinLoops (RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
 
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
 
std::shared_ptr< HashJoin > buildCurrentLevelHashTable (const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
 
void redeclareFilterFunction ()
 
llvm::Value * addJoinLoopIterator (const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
 
void codegenJoinLoops (const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
 
bool compileBody (const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
 
void createErrorCheckControlFlow (llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
 
void insertErrorCodeChecker (llvm::Function *query_func, bool hoist_literals)
 
void preloadFragOffsets (const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
 
JoinHashTableOrError buildHashTableForQualifier (const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const HashType preferred_hash_type, ColumnCacheMap &column_cache)
 
void nukeOldState (const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU (llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU (llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
 
std::string generatePTX (const std::string &) const
 
void initializeNVPTXBackend () const
 
int64_t deviceCycles (int milliseconds) const
 
GroupColLLVMValue groupByColumnCodegen (Analyzer::Expr *group_by_col, const size_t col_width, const CompilationOptions &, const bool translate_null_val, const int64_t translated_null_val, GroupByAndAggregate::DiamondCodegen &, std::stack< llvm::BasicBlock * > &, const bool thread_mem_shared)
 
llvm::Value * castToFP (llvm::Value *val)
 
llvm::Value * castToIntPtrTyIn (llvm::Value *val, const size_t bit_width)
 
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::pair< bool, int64_t > skipFragment (const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
std::pair< bool, int64_t > skipFragmentInnerJoins (const InputDescriptor &table_desc, const RelAlgExecutionUnit &ra_exe_unit, const Fragmenter_Namespace::FragmentInfo &fragment, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
AggregatedColRange computeColRangesCache (const std::unordered_set< PhysicalInput > &phys_inputs)
 
StringDictionaryGenerations computeStringDictionaryGenerations (const std::unordered_set< PhysicalInput > &phys_inputs)
 
TableGenerations computeTableGenerations (std::unordered_set< int > phys_table_ids)
 
std::shared_ptr< CompilationContext > getCodeFromCache (const CodeCacheKey &, const CodeCache &)
 
std::vector< int8_t > serializeLiterals (const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
 
llvm::Value * spillDoubleElement (llvm::Value *elem_val, llvm::Type *elem_ty)
 

Static Private Member Functions

static size_t align (const size_t off_in, const size_t alignment)
 

Private Attributes

std::unique_ptr< CgenState > cgen_state_
 
std::unique_ptr< PlanState > plan_state_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::mutex gpu_exec_mutex_ [max_gpu_count]
 
std::mutex str_dict_mutex_
 
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
 
CodeCache cpu_code_cache_
 
CodeCache gpu_code_cache_
 
const unsigned block_size_x_
 
const unsigned grid_size_x_
 
const size_t max_gpu_slab_size_
 
const std::string debug_dir_
 
const std::string debug_file_
 
const ExecutorId executor_id_
 
const Catalog_Namespace::Catalog * catalog_
 
const TemporaryTables * temporary_tables_
 
int64_t kernel_queue_time_ms_ = 0
 
int64_t compilation_queue_time_ms_ = 0
 
std::unique_ptr< WindowProjectNodeContext > window_project_node_context_owned_
 
WindowFunctionContext * active_window_function_ {nullptr}
 
InputTableInfoCache input_table_info_cache_
 
AggregatedColRange agg_col_range_cache_
 
TableGenerations table_generations_
 

Static Private Attributes

static const int max_gpu_count {16}
 
static std::mutex gpu_active_modules_mutex_
 
static uint32_t gpu_active_modules_device_mask_ {0x0}
 
static void * gpu_active_modules_ [max_gpu_count]
 
static std::atomic< bool > interrupted_ {false}
 
static const size_t baseline_threshold
 
static const size_t code_cache_size {1000}
 
static mapd_shared_mutex executor_session_mutex_
 
static QuerySessionId current_query_session_ {""}
 
static size_t running_query_executor_id_ {0}
 
static InterruptFlagMap queries_interrupt_flag_
 
static QuerySessionMap queries_session_map_
 
static std::map< int, std::shared_ptr< Executor > > executors_
 
static std::atomic_flag execute_spin_lock_ = ATOMIC_FLAG_INIT
 
static mapd_shared_mutex execute_mutex_
 
static mapd_shared_mutex executors_cache_mutex_
 
static mapd_shared_mutex recycler_mutex_
 
static std::unordered_map< std::string, size_t > cardinality_cache_
 

Friends

class BaselineJoinHashTable
 
class CodeGenerator
 
class ColumnFetcher
 
class ExecutionKernel
 
class HashJoin
 
class OverlapsJoinHashTable
 
class GroupByAndAggregate
 
class QueryCompilationDescriptor
 
class QueryMemoryDescriptor
 
class QueryMemoryInitializer
 
class QueryFragmentDescriptor
 
class QueryExecutionContext
 
class ResultSet
 
class InValuesBitmap
 
class LeafAggregator
 
class PerfectJoinHashTable
 
class QueryRewriter
 
class PendingExecutionClosure
 
class RelAlgExecutor
 
class TableOptimizer
 
class TableFunctionCompilationContext
 
class TableFunctionExecutionContext
 
struct TargetExprCodegenBuilder
 
struct TargetExprCodegen
 
class WindowProjectNodeContext
 

Detailed Description

Definition at line 367 of file Execute.h.

Member Typedef Documentation

using Executor::CachedCardinality = std::pair<bool, size_t>

Definition at line 965 of file Execute.h.

using Executor::ExecutorId = size_t

Definition at line 374 of file Execute.h.

using Executor::PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>

Definition at line 540 of file Execute.h.

Constructor & Destructor Documentation

Executor::Executor ( const ExecutorId  id,
const size_t  block_size_x,
const size_t  grid_size_x,
const size_t  max_gpu_slab_size,
const std::string &  debug_dir,
const std::string &  debug_file 
)

Definition at line 137 of file Execute.cpp.

143  : cgen_state_(new CgenState({}, false))
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985

Member Function Documentation

void Executor::addCodeToCache ( const CodeCacheKey &  key,
std::shared_ptr< CompilationContext >  compilation_context,
llvm::Module *  module,
CodeCache &  cache 
)
static

Definition at line 385 of file NativeCodegen.cpp.

References LruCache< key_t, value_t, hash_t >::put().

Referenced by StubGenerator::generateStub().

388  {
389  cache.put(key,
390  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
391  std::move(compilation_context), std::move(module)));
392 }
void put(const key_t &key, value_t &&value)
Definition: LruCache.hpp:27
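
For orientation, here is a minimal standalone sketch of the put-then-reuse pattern that addCodeToCache() and getCodeFromCache() implement around the compilation cache. SimpleCodeCache and CompiledStub are illustrative stand-ins, not the real CodeCache/LruCache/CompilationContext types, and the key is assumed to be a vector of strings in the spirit of CodeCacheKey.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

using CacheKey = std::vector<std::string>;   // assumed stand-in for CodeCacheKey
struct CompiledStub {                        // stand-in for the cached compilation artifact
  std::string name;
};

class SimpleCodeCache {
 public:
  // Mirrors the put() call above: insert or overwrite the entry under the key.
  void put(const CacheKey& key, std::shared_ptr<CompiledStub> value) {
    cache_[key] = std::move(value);
  }
  // Mirrors what getCodeFromCache() does conceptually: nullptr on a miss.
  std::shared_ptr<CompiledStub> get(const CacheKey& key) const {
    auto it = cache_.find(key);
    return it == cache_.end() ? nullptr : it->second;
  }

 private:
  std::map<CacheKey, std::shared_ptr<CompiledStub>> cache_;
};

int main() {
  SimpleCodeCache cache;
  CacheKey key{"<query function IR>", "cpu"};
  // Compile once and cache the artifact ...
  auto stub = std::make_shared<CompiledStub>();
  stub->name = "generated_kernel";
  cache.put(key, stub);
  // ... later lookups with the same key reuse it instead of recompiling.
  if (auto hit = cache.get(key)) {
    std::cout << "cache hit: " << hit->name << "\n";
  }
  return 0;
}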


std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > Executor::addDeletedColumn ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 3247 of file Execute.cpp.

References catalog_(), CHECK, CHECK_EQ, CompilationOptions::filter_on_deleted_column, and TABLE.

3249  {
3250  if (!co.filter_on_deleted_column) {
3251  return std::make_tuple(ra_exe_unit, PlanState::DeletedColumnsMap{});
3252  }
3253  auto ra_exe_unit_with_deleted = ra_exe_unit;
3254  PlanState::DeletedColumnsMap deleted_cols_map;
3255  for (const auto& input_table : ra_exe_unit_with_deleted.input_descs) {
3256  if (input_table.getSourceType() != InputSourceType::TABLE) {
3257  continue;
3258  }
3259  const auto td = catalog_->getMetadataForTable(input_table.getTableId());
3260  CHECK(td);
3261  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
3262  if (!deleted_cd) {
3263  continue;
3264  }
3265  CHECK(deleted_cd->columnType.is_boolean());
3266  // check deleted column is not already present
3267  bool found = false;
3268  for (const auto& input_col : ra_exe_unit_with_deleted.input_col_descs) {
3269  if (input_col.get()->getColId() == deleted_cd->columnId &&
3270  input_col.get()->getScanDesc().getTableId() == deleted_cd->tableId &&
3271  input_col.get()->getScanDesc().getNestLevel() == input_table.getNestLevel()) {
3272  found = true;
3273  }
3274  }
3275  if (!found) {
3276  // add deleted column
3277  ra_exe_unit_with_deleted.input_col_descs.emplace_back(new InputColDescriptor(
3278  deleted_cd->columnId, deleted_cd->tableId, input_table.getNestLevel()));
3279  auto deleted_cols_it = deleted_cols_map.find(deleted_cd->tableId);
3280  if (deleted_cols_it == deleted_cols_map.end()) {
3281  CHECK(deleted_cols_map.insert(std::make_pair(deleted_cd->tableId, deleted_cd))
3282  .second);
3283  } else {
3284  CHECK_EQ(deleted_cd, deleted_cols_it->second);
3285  }
3286  }
3287  }
3288  return std::make_tuple(ra_exe_unit_with_deleted, deleted_cols_map);
3289 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
const ColumnDescriptor * getDeletedColumnIfRowsDeleted(const TableDescriptor *td) const
Definition: Catalog.cpp:3054
std::unordered_map< TableId, const ColumnDescriptor * > DeletedColumnsMap
Definition: PlanState.h:44
#define CHECK(condition)
Definition: Logger.h:197
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.


llvm::Value * Executor::addJoinLoopIterator ( const std::vector< llvm::Value * > &  prev_iters,
const size_t  level_idx 
)
private

Definition at line 635 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, and CgenState::scan_idx_to_hash_pos_.

636  {
638  // Iterators are added for loop-outer joins when the head of the loop is generated,
639  // then once again when the body if generated. Allow this instead of special handling
640  // of call sites.
641  const auto it = cgen_state_->scan_idx_to_hash_pos_.find(level_idx);
642  if (it != cgen_state_->scan_idx_to_hash_pos_.end()) {
643  return it->second;
644  }
645  CHECK(!prev_iters.empty());
646  llvm::Value* matching_row_index = prev_iters.back();
647  const auto it_ok =
648  cgen_state_->scan_idx_to_hash_pos_.emplace(level_idx, matching_row_index);
649  CHECK(it_ok.second);
650  return matching_row_index;
651 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:197
void Executor::addToCardinalityCache ( const std::string &  cache_key,
const size_t  cache_value 
)

Definition at line 3910 of file Execute.cpp.

References g_use_estimator_result_cache, and VLOG.

3911  {
3913  mapd_unique_lock<mapd_shared_mutex> lock(recycler_mutex_);
3914  cardinality_cache_[cache_key] = cache_value;
3915  VLOG(1) << "Put estimated cardinality to the cache";
3916  }
3917 }
static std::unordered_map< std::string, size_t > cardinality_cache_
Definition: Execute.h:1065
bool g_use_estimator_result_cache
Definition: Execute.cpp:111
static mapd_shared_mutex recycler_mutex_
Definition: Execute.h:1064
#define VLOG(n)
Definition: Logger.h:291
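
As a rough illustration of the pattern (simplified names, not the actual Executor members), the sketch below pairs this call with getCachedCardinality(): a shared mutex guards a string-keyed map, writers take an exclusive lock, and lookups return the (found, value) pair that CachedCardinality represents.

#include <iostream>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <utility>

std::shared_mutex recycler_mutex;                           // stand-in for recycler_mutex_
std::unordered_map<std::string, size_t> cardinality_cache;  // stand-in for cardinality_cache_

void add_to_cardinality_cache(const std::string& key, size_t value) {
  std::unique_lock<std::shared_mutex> lock(recycler_mutex);  // exclusive lock for writers
  cardinality_cache[key] = value;
}

std::pair<bool, size_t> get_cached_cardinality(const std::string& key) {
  std::shared_lock<std::shared_mutex> lock(recycler_mutex);  // shared lock for readers
  auto it = cardinality_cache.find(key);
  if (it == cardinality_cache.end()) {
    return {false, 0};  // miss: the caller falls back to running the estimator
  }
  return {true, it->second};
}

int main() {
  add_to_cardinality_cache("estimator-key-for-some-query", 4200);
  const auto cached = get_cached_cardinality("estimator-key-for-some-query");
  if (cached.first) {
    std::cout << "cached cardinality: " << cached.second << "\n";
  }
  return 0;
}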
bool Executor::addToQuerySessionList ( const QuerySessionId &  query_session,
const std::string &  query_str,
const std::chrono::time_point< std::chrono::system_clock >  submitted,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_status,
mapd_unique_lock< mapd_shared_mutex > &  write_lock 
)

Definition at line 3749 of file Execute.cpp.

References toString().

3755  {
3756  // an internal API that enrolls the query session into the Executor's session map
3757  auto t_str = ::toString(submitted);
3758  if (queries_session_map_.count(query_session)) {
3759  if (queries_session_map_.at(query_session).count(t_str)) {
3760  queries_session_map_.at(query_session).erase(t_str);
3761  queries_session_map_.at(query_session)
3762  .emplace(t_str,
3764  query_session, executor_id, query_str, submitted, query_status));
3765  } else {
3766  queries_session_map_.at(query_session)
3767  .emplace(t_str,
3769  query_session, executor_id, query_str, submitted, query_status));
3770  }
3771  } else {
3772  std::map<std::string, QuerySessionStatus> executor_per_query_map;
3773  executor_per_query_map.emplace(
3774  t_str,
3776  query_session, executor_id, query_str, submitted, query_status));
3777  queries_session_map_.emplace(query_session, executor_per_query_map);
3778  }
3779  return queries_interrupt_flag_.emplace(query_session, false).second;
3780 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1052
std::string toString(const ExtArgumentType &sig_type)
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1050
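
The bookkeeping above is a nested map keyed first by session id and then by submission timestamp. The standalone sketch below (simplified types, not the real QuerySessionMap/QuerySessionStatus) shows the same insert-or-replace logic plus the per-session interrupt flag whose creation the return value reports.

#include <iostream>
#include <map>
#include <string>

struct QueryStatusStub {  // stand-in for QuerySessionStatus
  std::string query_str;
  std::string status;
};

// session id -> (submitted-time string -> status), mirroring queries_session_map_
std::map<std::string, std::map<std::string, QueryStatusStub>> session_map;
// session id -> interrupt flag, mirroring queries_interrupt_flag_
std::map<std::string, bool> interrupt_flags;

bool add_to_session_list(const std::string& session,
                         const std::string& submitted,
                         const std::string& query_str) {
  // Replace any existing entry for the same (session, submitted) pair;
  // the per-session map is created on first use.
  session_map[session][submitted] = QueryStatusStub{query_str, "PENDING"};
  // True only if the interrupt flag for this session was newly created.
  return interrupt_flags.emplace(session, false).second;
}

int main() {
  bool first = add_to_session_list("session-123", "2021-01-01 00:00:00", "SELECT 1");
  bool second = add_to_session_list("session-123", "2021-01-01 00:00:05", "SELECT 2");
  std::cout << first << " " << second << "\n";  // prints "1 0"
  return 0;
}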


llvm::Value * Executor::aggregateWindowStatePtr ( )
private

Definition at line 124 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), and kFLOAT.

124  {
126  const auto window_func_context =
128  const auto window_func = window_func_context->getWindowFunction();
129  const auto arg_ti = get_adjusted_window_type_info(window_func);
130  llvm::Type* aggregate_state_type =
131  arg_ti.get_type() == kFLOAT
132  ? llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0)
133  : llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
134  const auto aggregate_state_i64 = cgen_state_->llInt(
135  reinterpret_cast<const int64_t>(window_func_context->aggregateState()));
136  return cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_i64,
137  aggregate_state_type);
138 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)


static size_t Executor::align ( const size_t  off_in,
const size_t  alignment 
)
inlinestaticprivate

Definition at line 977 of file Execute.h.

977  {
978  size_t off = off_in;
979  if (off % alignment != 0) {
980  off += (alignment - off % alignment);
981  }
982  return off;
983  }
CurrentQueryStatus Executor::attachExecutorToQuerySession ( std::shared_ptr< const query_state::QueryState > &  query_state)

Definition at line 3617 of file Execute.cpp.

References executor_id_().

3618  {
3619  QuerySessionId query_session_id = "";
3620  std::string query_str = "N/A";
3621  {
3622  mapd_shared_lock<mapd_shared_mutex> read_lock(executor_session_mutex_);
3623  // gather necessary query's info
3624  if (query_state != nullptr && query_state->getConstSessionInfo() != nullptr) {
3625  query_session_id = query_state->getConstSessionInfo()->get_session_id();
3626  query_str = query_state->getQueryStr();
3627  } else if (!getCurrentQuerySession(read_lock).empty()) {
3628  query_session_id = getCurrentQuerySession(read_lock);
3629  }
3630  }
3631  if (!query_session_id.empty()) {
3632  // if session is valid, do update 1) the exact executor id and 2) query status
3633  mapd_unique_lock<mapd_shared_mutex> write_lock(executor_session_mutex_);
3635  query_session_id, query_state->getQuerySubmittedTime(), executor_id_, write_lock);
3636  updateQuerySessionStatusWithLock(query_session_id,
3637  query_state->getQuerySubmittedTime(),
3638  QuerySessionStatus::QueryStatus::PENDING_EXECUTOR,
3639  write_lock);
3640  }
3641  return {query_session_id, query_str};
3642 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1044
bool updateQuerySessionExecutorAssignment(const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, const size_t executor_id, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:3810
QuerySessionId & getCurrentQuerySession(mapd_shared_lock< mapd_shared_mutex > &read_lock)
Definition: Execute.cpp:3594
const ExecutorId executor_id_
Definition: Execute.h:1028
bool updateQuerySessionStatusWithLock(const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, const QuerySessionStatus::QueryStatus updated_query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:3782
mapd_shared_lock< mapd_shared_mutex > read_lock
std::string QuerySessionId
Definition: Execute.h:78
mapd_unique_lock< mapd_shared_mutex > write_lock


unsigned Executor::blockSize ( ) const

Definition at line 3173 of file Execute.cpp.

References block_size_x_(), catalog_(), and CHECK.

3173  {
3174  CHECK(catalog_);
3175  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
3176  if (!cuda_mgr) {
3177  return 0;
3178  }
3179  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3180  return block_size_x_ ? block_size_x_ : dev_props.front().maxThreadsPerBlock;
3181 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:206
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:222
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
const unsigned block_size_x_
Definition: Execute.h:1022
#define CHECK(condition)
Definition: Logger.h:197
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:120


std::shared_ptr< HashJoin > Executor::buildCurrentLevelHashTable ( const JoinCondition &  current_level_join_conditions,
RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache,
std::vector< std::string > &  fail_reasons 
)
private

Definition at line 490 of file IRCodegen.cpp.

References anonymous_namespace{IRCodegen.cpp}::add_qualifier_to_execution_unit(), AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, Data_Namespace::CPU_LEVEL, CompilationOptions::device_type, Executor::JoinHashTableOrError::fail_reason, GPU, Data_Namespace::GPU_LEVEL, Executor::JoinHashTableOrError::hash_table, INNER, IS_EQUIVALENCE, PlanState::join_info_, OneToOne, CodeGenerator::plan_state_, JoinCondition::quals, and JoinCondition::type.

496  {
498  if (current_level_join_conditions.type != JoinType::INNER &&
499  current_level_join_conditions.quals.size() > 1) {
500  fail_reasons.emplace_back("No equijoin expression found for outer join");
501  return nullptr;
502  }
503  std::shared_ptr<HashJoin> current_level_hash_table;
504  for (const auto& join_qual : current_level_join_conditions.quals) {
505  auto qual_bin_oper = std::dynamic_pointer_cast<Analyzer::BinOper>(join_qual);
506  if (!qual_bin_oper || !IS_EQUIVALENCE(qual_bin_oper->get_optype())) {
507  fail_reasons.emplace_back("No equijoin expression found");
508  if (current_level_join_conditions.type == JoinType::INNER) {
509  add_qualifier_to_execution_unit(ra_exe_unit, join_qual);
510  }
511  continue;
512  }
513  JoinHashTableOrError hash_table_or_error;
514  if (!current_level_hash_table) {
515  hash_table_or_error = buildHashTableForQualifier(
516  qual_bin_oper,
517  query_infos,
521  column_cache);
522  current_level_hash_table = hash_table_or_error.hash_table;
523  }
524  if (hash_table_or_error.hash_table) {
525  plan_state_->join_info_.join_hash_tables_.push_back(hash_table_or_error.hash_table);
526  plan_state_->join_info_.equi_join_tautologies_.push_back(qual_bin_oper);
527  } else {
528  fail_reasons.push_back(hash_table_or_error.fail_reason);
529  if (current_level_join_conditions.type == JoinType::INNER) {
530  add_qualifier_to_execution_unit(ra_exe_unit, qual_bin_oper);
531  }
532  }
533  }
534  return current_level_hash_table;
535 }
#define IS_EQUIVALENCE(X)
Definition: sqldefs.h:67
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1000
void add_qualifier_to_execution_unit(RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< Analyzer::Expr > &qual)
Definition: IRCodegen.cpp:228
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
std::list< std::shared_ptr< Analyzer::Expr > > quals
JoinHashTableOrError buildHashTableForQualifier(const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const HashType preferred_hash_type, ColumnCacheMap &column_cache)
Definition: Execute.cpp:3118


Executor::JoinHashTableOrError Executor::buildHashTableForQualifier ( const std::shared_ptr< Analyzer::BinOper > &  qual_bin_oper,
const std::vector< InputTableInfo > &  query_infos,
const MemoryLevel  memory_level,
const HashType  preferred_hash_type,
ColumnCacheMap &  column_cache 
)
private

Definition at line 3118 of file Execute.cpp.

References g_enable_dynamic_watchdog, g_enable_overlaps_hashjoin, g_enable_runtime_query_interrupt, and HashJoin::getInstance().

3123  {
3124  if (!g_enable_overlaps_hashjoin && qual_bin_oper->is_overlaps_oper()) {
3125  return {nullptr, "Overlaps hash join disabled, attempting to fall back to loop join"};
3126  }
3127  // check whether the interrupt flag turns on (non kernel-time query interrupt)
3129  interrupted_.load()) {
3130  resetInterrupt();
3132  }
3133  try {
3134  auto tbl = HashJoin::getInstance(qual_bin_oper,
3135  query_infos,
3136  memory_level,
3137  preferred_hash_type,
3138  deviceCountForMemoryLevel(memory_level),
3139  column_cache,
3140  this);
3141  return {tbl, ""};
3142  } catch (const HashJoinFail& e) {
3143  return {nullptr, e.what()};
3144  }
3145 }
static std::shared_ptr< HashJoin > getInstance(const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor)
Make hash table from an in-flight SQL query's parse tree etc.
Definition: HashJoin.cpp:218
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1076
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:77
bool g_enable_overlaps_hashjoin
Definition: Execute.cpp:94
void resetInterrupt()
static std::atomic< bool > interrupted_
Definition: Execute.h:1009
int deviceCountForMemoryLevel(const Data_Namespace::MemoryLevel memory_level) const
Definition: Execute.cpp:651
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:110


std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> Executor::buildIsDeletedCb ( const RelAlgExecutionUnit &  ra_exe_unit,
const size_t  level_idx,
const CompilationOptions &  co 
)
private

Definition at line 431 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::filter_on_deleted_column, PlanState::getDeletedColForTable(), RelAlgExecutionUnit::input_descs, CgenState::ir_builder_, CgenState::llBool(), CgenState::llInt(), CodeGenerator::plan_state_, TABLE, and CodeGenerator::toBool().

433  {
435  if (!co.filter_on_deleted_column) {
436  return nullptr;
437  }
438  CHECK_LT(level_idx + 1, ra_exe_unit.input_descs.size());
439  const auto input_desc = ra_exe_unit.input_descs[level_idx + 1];
440  if (input_desc.getSourceType() != InputSourceType::TABLE) {
441  return nullptr;
442  }
443 
444  const auto deleted_cd = plan_state_->getDeletedColForTable(input_desc.getTableId());
445  if (!deleted_cd) {
446  return nullptr;
447  }
448  CHECK(deleted_cd->columnType.is_boolean());
449  const auto deleted_expr = makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
450  input_desc.getTableId(),
451  deleted_cd->columnId,
452  input_desc.getNestLevel());
453  return [this, deleted_expr, level_idx, &co](const std::vector<llvm::Value*>& prev_iters,
454  llvm::Value* have_more_inner_rows) {
455  const auto matching_row_index = addJoinLoopIterator(prev_iters, level_idx + 1);
456  // Avoid fetching the deleted column from a position which is not valid.
457  // An invalid position can be returned by a one to one hash lookup (negative)
458  // or at the end of iteration over a set of matching values.
459  llvm::Value* is_valid_it{nullptr};
460  if (have_more_inner_rows) {
461  is_valid_it = have_more_inner_rows;
462  } else {
463  is_valid_it = cgen_state_->ir_builder_.CreateICmp(
464  llvm::ICmpInst::ICMP_SGE, matching_row_index, cgen_state_->llInt<int64_t>(0));
465  }
466  const auto it_valid_bb = llvm::BasicBlock::Create(
467  cgen_state_->context_, "it_valid", cgen_state_->current_func_);
468  const auto it_not_valid_bb = llvm::BasicBlock::Create(
469  cgen_state_->context_, "it_not_valid", cgen_state_->current_func_);
470  cgen_state_->ir_builder_.CreateCondBr(is_valid_it, it_valid_bb, it_not_valid_bb);
471  const auto row_is_deleted_bb = llvm::BasicBlock::Create(
472  cgen_state_->context_, "row_is_deleted", cgen_state_->current_func_);
473  cgen_state_->ir_builder_.SetInsertPoint(it_valid_bb);
474  CodeGenerator code_generator(this);
475  const auto row_is_deleted = code_generator.toBool(
476  code_generator.codegen(deleted_expr.get(), true, co).front());
477  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
478  cgen_state_->ir_builder_.SetInsertPoint(it_not_valid_bb);
479  const auto row_is_deleted_default = cgen_state_->llBool(false);
480  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
481  cgen_state_->ir_builder_.SetInsertPoint(row_is_deleted_bb);
482  auto row_is_deleted_or_default =
483  cgen_state_->ir_builder_.CreatePHI(row_is_deleted->getType(), 2);
484  row_is_deleted_or_default->addIncoming(row_is_deleted, it_valid_bb);
485  row_is_deleted_or_default->addIncoming(row_is_deleted_default, it_not_valid_bb);
486  return row_is_deleted_or_default;
487  };
488 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1000
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635
#define CHECK(condition)
Definition: Logger.h:197


std::vector< JoinLoop > Executor::buildJoinLoops ( RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const ExecutionOptions &  eo,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache 
)
private

Definition at line 260 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), INJECT_TIMER, CgenState::ir_builder_, RelAlgExecutionUnit::join_quals, LEFT, CgenState::llBool(), OneToOne, CgenState::outer_join_match_found_per_level_, Set, Singleton, JoinLoopDomain::slot_lookup_result, CodeGenerator::toBool(), JoinCondition::type, and JoinLoopDomain::values_buffer.

265  {
268  std::vector<JoinLoop> join_loops;
269  for (size_t level_idx = 0, current_hash_table_idx = 0;
270  level_idx < ra_exe_unit.join_quals.size();
271  ++level_idx) {
272  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
273  std::vector<std::string> fail_reasons;
274  const auto build_cur_level_hash_table = [&]() {
275  if (current_level_join_conditions.quals.size() > 1) {
276  const auto first_qual = *current_level_join_conditions.quals.begin();
277  auto qual_bin_oper =
278  std::dynamic_pointer_cast<const Analyzer::BinOper>(first_qual);
279  if (qual_bin_oper && qual_bin_oper->is_overlaps_oper() &&
280  current_level_join_conditions.type == JoinType::LEFT) {
281  JoinCondition join_condition{{first_qual}, current_level_join_conditions.type};
282 
284  join_condition, ra_exe_unit, co, query_infos, column_cache, fail_reasons);
285  }
286  }
287  return buildCurrentLevelHashTable(current_level_join_conditions,
288  ra_exe_unit,
289  co,
290  query_infos,
291  column_cache,
292  fail_reasons);
293  };
294  const auto current_level_hash_table = build_cur_level_hash_table();
295  const auto found_outer_join_matches_cb =
296  [this, level_idx](llvm::Value* found_outer_join_matches) {
297  CHECK_LT(level_idx, cgen_state_->outer_join_match_found_per_level_.size());
298  CHECK(!cgen_state_->outer_join_match_found_per_level_[level_idx]);
299  cgen_state_->outer_join_match_found_per_level_[level_idx] =
300  found_outer_join_matches;
301  };
302  const auto is_deleted_cb = buildIsDeletedCb(ra_exe_unit, level_idx, co);
303  const auto outer_join_condition_multi_quals_cb =
304  [this, level_idx, &co, &current_level_join_conditions](
305  const std::vector<llvm::Value*>& prev_iters) {
306  // The values generated for the match path don't dominate all uses
307  // since on the non-match path nulls are generated. Reset the cache
308  // once the condition is generated to avoid incorrect reuse.
309  FetchCacheAnchor anchor(cgen_state_.get());
310  addJoinLoopIterator(prev_iters, level_idx + 1);
311  llvm::Value* left_join_cond = cgen_state_->llBool(true);
312  CodeGenerator code_generator(this);
313  // Do not want to look at all quals! only 1..N quals (ignore first qual)
314  // Note(jclay): this may need to support cases larger than 2
315  // are there any?
316  if (current_level_join_conditions.quals.size() >= 2) {
317  auto qual_it = std::next(current_level_join_conditions.quals.begin(), 1);
318  for (; qual_it != current_level_join_conditions.quals.end();
319  std::advance(qual_it, 1)) {
320  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
321  left_join_cond,
322  code_generator.toBool(
323  code_generator.codegen(qual_it->get(), true, co).front()));
324  }
325  }
326  return left_join_cond;
327  };
328  if (current_level_hash_table) {
329  if (current_level_hash_table->getHashType() == HashType::OneToOne) {
330  join_loops.emplace_back(
331  /*kind=*/JoinLoopKind::Singleton,
332  /*type=*/current_level_join_conditions.type,
333  /*iteration_domain_codegen=*/
334  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
335  const std::vector<llvm::Value*>& prev_iters) {
336  addJoinLoopIterator(prev_iters, level_idx);
337  JoinLoopDomain domain{{0}};
338  domain.slot_lookup_result =
339  current_level_hash_table->codegenSlot(co, current_hash_table_idx);
340  return domain;
341  },
342  /*outer_condition_match=*/nullptr,
343  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
344  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
345  : nullptr,
346  /*is_deleted=*/is_deleted_cb);
347  } else {
348  join_loops.emplace_back(
349  /*kind=*/JoinLoopKind::Set,
350  /*type=*/current_level_join_conditions.type,
351  /*iteration_domain_codegen=*/
352  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
353  const std::vector<llvm::Value*>& prev_iters) {
354  addJoinLoopIterator(prev_iters, level_idx);
355  JoinLoopDomain domain{{0}};
356  const auto matching_set = current_level_hash_table->codegenMatchingSet(
357  co, current_hash_table_idx);
358  domain.values_buffer = matching_set.elements;
359  domain.element_count = matching_set.count;
360  return domain;
361  },
362  /*outer_condition_match=*/
363  current_level_join_conditions.type == JoinType::LEFT
364  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
365  outer_join_condition_multi_quals_cb)
366  : nullptr,
367  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
368  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
369  : nullptr,
370  /*is_deleted=*/is_deleted_cb);
371  }
372  ++current_hash_table_idx;
373  } else {
374  const auto fail_reasons_str = current_level_join_conditions.quals.empty()
375  ? "No equijoin expression found"
376  : boost::algorithm::join(fail_reasons, " | ");
378  ra_exe_unit, eo, query_infos, level_idx, fail_reasons_str);
379  // Callback provided to the `JoinLoop` framework to evaluate the (outer) join
380  // condition.
381  VLOG(1) << "Unable to build hash table, falling back to loop join: "
382  << fail_reasons_str;
383  const auto outer_join_condition_cb =
384  [this, level_idx, &co, &current_level_join_conditions](
385  const std::vector<llvm::Value*>& prev_iters) {
386  // The values generated for the match path don't dominate all uses
387  // since on the non-match path nulls are generated. Reset the cache
388  // once the condition is generated to avoid incorrect reuse.
389  FetchCacheAnchor anchor(cgen_state_.get());
390  addJoinLoopIterator(prev_iters, level_idx + 1);
391  llvm::Value* left_join_cond = cgen_state_->llBool(true);
392  CodeGenerator code_generator(this);
393  for (auto expr : current_level_join_conditions.quals) {
394  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
395  left_join_cond,
396  code_generator.toBool(
397  code_generator.codegen(expr.get(), true, co).front()));
398  }
399  return left_join_cond;
400  };
401  join_loops.emplace_back(
402  /*kind=*/JoinLoopKind::UpperBound,
403  /*type=*/current_level_join_conditions.type,
404  /*iteration_domain_codegen=*/
405  [this, level_idx](const std::vector<llvm::Value*>& prev_iters) {
406  addJoinLoopIterator(prev_iters, level_idx);
407  JoinLoopDomain domain{{0}};
408  const auto rows_per_scan_ptr = cgen_state_->ir_builder_.CreateGEP(
409  get_arg_by_name(cgen_state_->row_func_, "num_rows_per_scan"),
410  cgen_state_->llInt(int32_t(level_idx + 1)));
411  domain.upper_bound = cgen_state_->ir_builder_.CreateLoad(rows_per_scan_ptr,
412  "num_rows_per_scan");
413  return domain;
414  },
415  /*outer_condition_match=*/
416  current_level_join_conditions.type == JoinType::LEFT
417  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
418  outer_join_condition_cb)
419  : nullptr,
420  /*found_outer_matches=*/
421  current_level_join_conditions.type == JoinType::LEFT
422  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
423  : nullptr,
424  /*is_deleted=*/is_deleted_cb);
425  }
426  }
427  return join_loops;
428 }
std::shared_ptr< HashJoin > buildCurrentLevelHashTable(const JoinCondition &current_level_join_conditions, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
Definition: IRCodegen.cpp:490
llvm::Value * values_buffer
Definition: JoinLoop.h:48
std::string join(T const &container, std::string const &delim)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:162
#define INJECT_TIMER(DESC)
Definition: measure.h:93
const JoinQualsPerNestingLevel join_quals
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * slot_lookup_result
Definition: JoinLoop.h:46
#define CHECK_LT(x, y)
Definition: Logger.h:207
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635
#define CHECK(condition)
Definition: Logger.h:197
void check_if_loop_join_is_allowed(RelAlgExecutionUnit &ra_exe_unit, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, const size_t level_idx, const std::string &fail_reason)
Definition: IRCodegen.cpp:238
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:260
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
Definition: IRCodegen.cpp:431
#define VLOG(n)
Definition: Logger.h:291


void Executor::buildSelectedFragsMapping ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList &  selected_fragments,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 2613 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LT, and RelAlgExecutionUnit::input_descs.

2618  {
2619  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2620  size_t frag_pos{0};
2621  const auto& input_descs = ra_exe_unit.input_descs;
2622  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2623  const int table_id = input_descs[scan_idx].getTableId();
2624  CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2625  selected_fragments_crossjoin.push_back(
2626  getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2627  for (const auto& col_id : col_global_ids) {
2628  CHECK(col_id);
2629  const auto& input_desc = col_id->getScanDesc();
2630  if (input_desc.getTableId() != table_id ||
2631  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2632  continue;
2633  }
2634  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2635  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2636  CHECK_LT(static_cast<size_t>(it->second),
2637  plan_state_->global_to_local_col_ids_.size());
2638  local_col_to_frag_pos[it->second] = frag_pos;
2639  }
2640  ++frag_pos;
2641  }
2642 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1000
#define CHECK_LT(x, y)
Definition: Logger.h:207
std::vector< size_t > getFragmentCount(const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:2599
#define CHECK(condition)
Definition: Logger.h:197
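
The loop nest above is a scatter from scan levels to column slots: every column belonging to scan level scan_idx is pointed at the fragment position reserved for that scan. The sketch below restates the mapping with plain STL containers; ColKey and the scan list are stand-ins for InputColDescriptor and the input descriptors, not the actual types.

#include <cstddef>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

struct ColKey {
  int table_id;
  int nest_level;
  int col_id;
  bool operator<(const ColKey& o) const {
    return std::tie(table_id, nest_level, col_id) <
           std::tie(o.table_id, o.nest_level, o.col_id);
  }
};

// For each scan level, point every column that belongs to that level at the
// fragment position reserved for the level.
std::vector<size_t> map_cols_to_frag_pos(
    const std::map<ColKey, size_t>& global_to_local_col_ids,
    const std::vector<std::pair<int, int>>& scans) {  // (table_id, nest_level) per scan
  std::vector<size_t> local_col_to_frag_pos(global_to_local_col_ids.size());
  size_t frag_pos = 0;
  for (const auto& scan : scans) {
    for (const auto& kv : global_to_local_col_ids) {
      if (kv.first.table_id != scan.first || kv.first.nest_level != scan.second) {
        continue;  // column belongs to a different scan level
      }
      local_col_to_frag_pos[kv.second] = frag_pos;
    }
    ++frag_pos;
  }
  return local_col_to_frag_pos;
}
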
void Executor::buildSelectedFragsMappingForUnion ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList selected_fragments,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 2644 of file Execute.cpp.

References CHECK, CHECK_LT, and RelAlgExecutionUnit::input_descs.

2649  {
2650  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
2651  size_t frag_pos{0};
2652  const auto& input_descs = ra_exe_unit.input_descs;
2653  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
2654  const int table_id = input_descs[scan_idx].getTableId();
2655  // selected_fragments here is from assignFragsToKernelDispatch
2656  // execution_kernel.fragments
2657  if (selected_fragments[0].table_id != table_id) { // TODO 0
2658  continue;
2659  }
2660  // CHECK_EQ(selected_fragments[scan_idx].table_id, table_id);
2661  selected_fragments_crossjoin.push_back(
2662  // getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
2663  {size_t(1)}); // TODO
2664  for (const auto& col_id : col_global_ids) {
2665  CHECK(col_id);
2666  const auto& input_desc = col_id->getScanDesc();
2667  if (input_desc.getTableId() != table_id ||
2668  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
2669  continue;
2670  }
2671  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
2672  CHECK(it != plan_state_->global_to_local_col_ids_.end());
2673  CHECK_LT(static_cast<size_t>(it->second),
2674  plan_state_->global_to_local_col_ids_.size());
2675  local_col_to_frag_pos[it->second] = frag_pos;
2676  }
2677  ++frag_pos;
2678  }
2679 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1000
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197
llvm::Value * Executor::castToFP ( llvm::Value *  val)
private

Definition at line 3195 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, logger::FATAL, LOG, and to_string().

3195  {
3197  if (!val->getType()->isIntegerTy()) {
3198  return val;
3199  }
3200 
3201  auto val_width = static_cast<llvm::IntegerType*>(val->getType())->getBitWidth();
3202  llvm::Type* dest_ty{nullptr};
3203  switch (val_width) {
3204  case 32:
3205  dest_ty = llvm::Type::getFloatTy(cgen_state_->context_);
3206  break;
3207  case 64:
3208  dest_ty = llvm::Type::getDoubleTy(cgen_state_->context_);
3209  break;
3210  default:
3211  LOG(FATAL) << "Unsupported FP width: " << std::to_string(val_width);
3212  }
3213  return cgen_state_->ir_builder_.CreateSIToFP(val, dest_ty);
3214 }
#define LOG(tag)
Definition: Logger.h:188
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
std::string to_string(char const *&&v)
#define AUTOMATIC_IR_METADATA(CGENSTATE)

+ Here is the call graph for this function:
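The width-to-type dispatch above can be reproduced outside the Executor with a bare IRBuilder. This is a minimal illustrative sketch (not OmniSciDB code, helper name invented); it covers only the 32/64-bit signed integer cases, whereas castToFP aborts on any other width.

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Value.h>

// Convert a signed integer SSA value to the floating-point type of the same
// width class: i32 -> float, i64 -> double. Non-integer values pass through.
llvm::Value* cast_int_to_fp(llvm::IRBuilder<>& builder, llvm::Value* val) {
  if (!val->getType()->isIntegerTy()) {
    return val;
  }
  const unsigned width = val->getType()->getIntegerBitWidth();
  llvm::Type* dest_ty = (width == 32) ? builder.getFloatTy() : builder.getDoubleTy();
  return builder.CreateSIToFP(val, dest_ty);
}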

llvm::Value * Executor::castToIntPtrTyIn ( llvm::Value *  val,
const size_t  bit_width 
)
private

Definition at line 3216 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_LT, and get_int_type().

3216  {
3218  CHECK(val->getType()->isPointerTy());
3219 
3220  const auto val_ptr_type = static_cast<llvm::PointerType*>(val->getType());
3221  const auto val_type = val_ptr_type->getElementType();
3222  size_t val_width = 0;
3223  if (val_type->isIntegerTy()) {
3224  val_width = val_type->getIntegerBitWidth();
3225  } else {
3226  if (val_type->isFloatTy()) {
3227  val_width = 32;
3228  } else {
3229  CHECK(val_type->isDoubleTy());
3230  val_width = 64;
3231  }
3232  }
3233  CHECK_LT(size_t(0), val_width);
3234  if (bitWidth == val_width) {
3235  return val;
3236  }
3237  return cgen_state_->ir_builder_.CreateBitCast(
3238  val, llvm::PointerType::get(get_int_type(bitWidth, cgen_state_->context_), 0));
3239 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:207
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:
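The bitcast at the end is the only IR actually emitted. A stripped-down equivalent with a bare IRBuilder looks like the sketch below (names invented); the real function additionally inspects the current pointee width and returns the value unchanged when it already matches.

#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/IRBuilder.h>

// Reinterpret an existing pointer as a pointer to an integer of the requested
// bit width in address space 0.
llvm::Value* as_int_ptr(llvm::IRBuilder<>& builder,
                        llvm::Value* ptr,
                        const unsigned bit_width) {
  auto* int_ty = llvm::IntegerType::get(builder.getContext(), bit_width);
  auto* int_ptr_ty = llvm::PointerType::get(int_ty, /*AddressSpace=*/0);
  return builder.CreateBitCast(ptr, int_ptr_ty);
}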

bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3603 of file Execute.cpp.

3604  {
3605  // true only when the candidate session is non-empty and matches the
3606  // currently running query session
3607  return !candidate_query_session.empty() &&
3608  (current_query_session_ == candidate_query_session);
3609 }
static QuerySessionId current_query_session_
Definition: Execute.h:1046
bool Executor::checkIsQuerySessionEnrolled ( const QuerySessionId query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3889 of file Execute.cpp.

3891  {
3892  return !query_session.empty() && queries_session_map_.count(query_session);
3893 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1052
bool Executor::checkIsQuerySessionInterrupted ( const QuerySessionId query_session,
mapd_shared_lock< mapd_shared_mutex > &  read_lock 
)

Definition at line 3881 of file Execute.cpp.

3883  {
3884  auto flag_it = queries_interrupt_flag_.find(query_session);
3885  return !query_session.empty() && flag_it != queries_interrupt_flag_.end() &&
3886  flag_it->second;
3887 }
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1050
void Executor::checkPendingQueryStatus ( const QuerySessionId query_session)

Definition at line 3644 of file Execute.cpp.

References ERR_INTERRUPTED, and VLOG.

3644  {
3645  // check whether the "pending" query is okay to execute, i.e., whether
3646  // this query session was already interrupted before it started running
3647  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
3648  if (query_session.empty()) {
3649  return;
3650  }
3651  if (queries_interrupt_flag_.find(query_session) == queries_interrupt_flag_.end()) {
3652  // something went wrong: the caller is responsible for invoking this
3653  // function only for an enrolled query session
3654  if (!queries_session_map_.count(query_session)) {
3655  VLOG(1) << "Interrupting pending query is not available since the query session is "
3656  "not enrolled";
3657  } else {
3658  // here the query session is enrolled but the interrupt flag is not registered
3659  VLOG(1)
3660  << "Interrupting pending query is not available since its interrupt flag is "
3661  "not registered";
3662  }
3663  return;
3664  }
3665  if (queries_interrupt_flag_[query_session]) {
3666  throw QueryExecutionError(ERR_INTERRUPTED);
3667  }
3668 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1044
static QuerySessionMap queries_session_map_
Definition: Execute.h:1052
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1076
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1050
#define VLOG(n)
Definition: Logger.h:291
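
Taken together, the session predicates above reduce to map lookups guarded by an emptiness check. The sketch below restates the checkPendingQueryStatus logic with plain STL containers; the aliases and the exception type are stand-ins, not the Executor's actual members.

#include <map>
#include <stdexcept>
#include <string>

using QuerySessionId = std::string;
using InterruptFlagMap = std::map<QuerySessionId, bool>;

// Before a pending query runs, make sure its session was not already
// interrupted while the query was waiting.
void check_pending_query(const InterruptFlagMap& interrupt_flags,
                         const QuerySessionId& query_session) {
  if (query_session.empty()) {
    return;  // queries without a session are not interruptible this way
  }
  const auto it = interrupt_flags.find(query_session);
  if (it == interrupt_flags.end()) {
    return;  // session not enrolled: nothing to check
  }
  if (it->second) {
    throw std::runtime_error("query interrupted before execution started");
  }
}
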
void Executor::clearMemory ( const Data_Namespace::MemoryLevel  memory_level)
static

Definition at line 178 of file Execute.cpp.

References Data_Namespace::DataMgr::clearMemory(), Data_Namespace::CPU_LEVEL, Catalog_Namespace::SysCatalog::getDataMgr(), Data_Namespace::GPU_LEVEL, Catalog_Namespace::SysCatalog::instance(), and CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCaches().

Referenced by DBHandler::clear_cpu_memory(), DBHandler::clear_gpu_memory(), QueryRunner::QueryRunner::clearCpuMemory(), and QueryRunner::QueryRunner::clearGpuMemory().

178  {
179  switch (memory_level) {
180  case Data_Namespace::MemoryLevel::CPU_LEVEL:
181  case Data_Namespace::MemoryLevel::GPU_LEVEL: {
182  mapd_unique_lock<mapd_shared_mutex> flush_lock(
183  execute_mutex_); // Don't flush memory while queries are running
184 
185  Catalog_Namespace::SysCatalog::instance().getDataMgr().clearMemory(memory_level);
186  if (memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
187  // The hash table cache uses CPU memory not managed by the buffer manager. In the
188  // future, we should manage these allocations with the buffer manager directly.
189  // For now, assume the user wants to purge the hash table cache when they clear
190  // CPU memory (currently used in ExecuteTest to lower memory pressure)
192  }
193  break;
194  }
195  default: {
196  throw std::runtime_error(
197  "Clearing memory levels other than the CPU level or GPU level is not "
198  "supported.");
199  }
200  }
201 }
static mapd_shared_mutex execute_mutex_
Definition: Execute.h:1059
void clearMemory(const MemoryLevel memLevel)
Definition: DataMgr.cpp:385
static void invalidateCaches()
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:189
static SysCatalog & instance()
Definition: SysCatalog.h:288

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
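A hypothetical call site, mirroring what the DBHandler memory-clearing endpoints listed above do (include path abbreviated):

#include "Execute.h"

// Drop the CPU buffer pool contents (and, for CPU_LEVEL, the hash table
// cache). The call blocks until no query holds execute_mutex_.
void flush_cpu_buffers() {
  Executor::clearMemory(Data_Namespace::MemoryLevel::CPU_LEVEL);
}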

void Executor::clearMetaInfoCache ( )
private

Definition at line 373 of file Execute.cpp.

References input_table_info_cache_().

373  {
374  input_table_info_cache_.clear();
375  agg_col_range_cache_.clear();
376  table_generations_.clear();
377 }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1042
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:1041
TableGenerations table_generations_
Definition: Execute.h:1043

+ Here is the call graph for this function:

void Executor::clearQuerySessionStatus ( const QuerySessionId query_session,
const std::chrono::time_point< std::chrono::system_clock >  submitted,
bool  acquire_spin_lock 
)

Definition at line 3670 of file Execute.cpp.

References executor_id_().

3673  {
3674  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
3675  // clear the interrupt-related info for a finished query
3676  if (query_session.empty()) {
3677  return;
3678  }
3679  removeFromQuerySessionList(query_session, submitted, session_write_lock);
3680  if (query_session.compare(current_query_session_) == 0 &&
3681  running_query_executor_id_ == executor_id_) {
3682  invalidateRunningQuerySession(session_write_lock);
3683  if (acquire_spin_lock) {
3684  // for single-node execution, and dist mode uses external spin_lock
3685  // instead of executor's internal spin_lock
3686  execute_spin_lock_.clear(std::memory_order_release);
3687  resetInterrupt();
3688  }
3689  }
3690 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1044
void invalidateRunningQuerySession(mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:3611
static std::atomic_flag execute_spin_lock_
Definition: Execute.h:1055
const ExecutorId executor_id_
Definition: Execute.h:1028
bool removeFromQuerySessionList(const QuerySessionId &query_session, const std::chrono::time_point< std::chrono::system_clock > submitted, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:3834
static QuerySessionId current_query_session_
Definition: Execute.h:1046
void resetInterrupt()
static size_t running_query_executor_id_
Definition: Execute.h:1048

+ Here is the call graph for this function:

llvm::Value * Executor::codegenAggregateWindowState ( )
private

Definition at line 326 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), Analyzer::WindowFunction::getKind(), kDECIMAL, kDOUBLE, and kFLOAT.

326  {
328  const auto pi32_type =
329  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
330  const auto pi64_type =
331  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
332  const auto window_func_context =
334  const Analyzer::WindowFunction* window_func = window_func_context->getWindowFunction();
335  const auto window_func_ti = get_adjusted_window_type_info(window_func);
336  const auto aggregate_state_type =
337  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
338  auto aggregate_state = aggregateWindowStatePtr();
339  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
340  const auto aggregate_state_count_i64 = cgen_state_->llInt(
341  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
342  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
343  aggregate_state_count_i64, aggregate_state_type);
344  const auto double_null_lv = cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE));
345  switch (window_func_ti.get_type()) {
346  case kFLOAT: {
347  return cgen_state_->emitCall(
348  "load_avg_float", {aggregate_state, aggregate_state_count, double_null_lv});
349  }
350  case kDOUBLE: {
351  return cgen_state_->emitCall(
352  "load_avg_double", {aggregate_state, aggregate_state_count, double_null_lv});
353  }
354  case kDECIMAL: {
355  return cgen_state_->emitCall(
356  "load_avg_decimal",
357  {aggregate_state,
358  aggregate_state_count,
359  double_null_lv,
360  cgen_state_->llInt<int32_t>(window_func_ti.get_scale())});
361  }
362  default: {
363  return cgen_state_->emitCall(
364  "load_avg_int", {aggregate_state, aggregate_state_count, double_null_lv});
365  }
366  }
367  }
368  if (window_func->getKind() == SqlWindowFunctionKind::COUNT) {
369  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
370  }
371  switch (window_func_ti.get_type()) {
372  case kFLOAT: {
373  return cgen_state_->emitCall("load_float", {aggregate_state});
374  }
375  case kDOUBLE: {
376  return cgen_state_->emitCall("load_double", {aggregate_state});
377  }
378  default: {
379  return cgen_state_->ir_builder_.CreateLoad(aggregate_state);
380  }
381  }
382 }
SqlWindowFunctionKind getKind() const
Definition: Analyzer.h:1447
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:
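For AVG the aggregate state holds two slots, a running sum and a row count, and the load_avg_* builtins called above combine them at read time. A scalar sketch of the double case (illustrative only, not the runtime builtin):

#include <cstdint>

double load_avg_double_sketch(const double agg_state_sum,
                              const int64_t agg_state_count,
                              const double null_val) {
  return agg_state_count == 0
             ? null_val
             : agg_state_sum / static_cast<double>(agg_state_count);
}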

void Executor::codegenJoinLoops ( const std::vector< JoinLoop > &  join_loops,
const RelAlgExecutionUnit ra_exe_unit,
GroupByAndAggregate group_by_and_aggregate,
llvm::Function *  query_func,
llvm::BasicBlock *  entry_bb,
const QueryMemoryDescriptor query_mem_desc,
const CompilationOptions co,
const ExecutionOptions eo 
)
private

Definition at line 653 of file IRCodegen.cpp.

References ExecutionOptions::allow_runtime_query_interrupt, AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, JoinLoop::codegen(), CgenState::context_, CgenState::current_func_, CompilationOptions::device_type, CgenState::ir_builder_, CgenState::llInt(), CgenState::needs_error_check_, CodeGenerator::posArg(), GroupByAndAggregate::query_infos_, and ExecutionOptions::with_dynamic_watchdog.

660  {
662  const auto exit_bb =
663  llvm::BasicBlock::Create(cgen_state_->context_, "exit", cgen_state_->current_func_);
664  cgen_state_->ir_builder_.SetInsertPoint(exit_bb);
665  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
666  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
667  CodeGenerator code_generator(this);
668  const auto loops_entry_bb = JoinLoop::codegen(
669  join_loops,
670  [this,
671  query_func,
672  &query_mem_desc,
673  &co,
674  &eo,
675  &group_by_and_aggregate,
676  &join_loops,
677  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
679  addJoinLoopIterator(prev_iters, join_loops.size());
680  auto& builder = cgen_state_->ir_builder_;
681  const auto loop_body_bb = llvm::BasicBlock::Create(
682  builder.getContext(), "loop_body", builder.GetInsertBlock()->getParent());
683  builder.SetInsertPoint(loop_body_bb);
684  const bool can_return_error =
685  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
686  if (can_return_error || cgen_state_->needs_error_check_ ||
688  createErrorCheckControlFlow(query_func,
691  co.device_type,
692  group_by_and_aggregate.query_infos_);
693  }
694  return loop_body_bb;
695  },
696  code_generator.posArg(nullptr),
697  exit_bb,
698  cgen_state_.get());
699  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
700  cgen_state_->ir_builder_.CreateBr(loops_entry_bb);
701 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
const bool with_dynamic_watchdog
static llvm::BasicBlock * codegen(const std::vector< JoinLoop > &join_loops, const std::function< llvm::BasicBlock *(const std::vector< llvm::Value * > &)> &body_codegen, llvm::Value *outer_iter, llvm::BasicBlock *exit_bb, CgenState *cgen_state)
Definition: JoinLoop.cpp:46
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
const std::vector< InputTableInfo > & query_infos_
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:635
const bool allow_runtime_query_interrupt
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)

+ Here is the call graph for this function:
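Conceptually, JoinLoop::codegen builds the IR equivalent of a recursive loop nest whose innermost body is supplied by the callback passed above. A runtime analogue under that assumption (plain loops instead of basic blocks, names invented):

#include <cstddef>
#include <functional>
#include <vector>

void run_join_loops(const std::vector<size_t>& loop_counts,
                    const std::function<void(const std::vector<size_t>&)>& body) {
  std::vector<size_t> iters(loop_counts.size(), 0);
  std::function<void(size_t)> recurse = [&](const size_t level) {
    if (level == loop_counts.size()) {
      body(iters);  // corresponds to the generated "loop_body" block
      return;
    }
    for (iters[level] = 0; iters[level] < loop_counts[level]; ++iters[level]) {
      recurse(level + 1);
    }
  };
  recurse(0);
}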

llvm::BasicBlock * Executor::codegenSkipDeletedOuterTableRow ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co 
)
private

Definition at line 2909 of file NativeCodegen.cpp.

2911  {
2913  if (!co.filter_on_deleted_column) {
2914  return nullptr;
2915  }
2916  CHECK(!ra_exe_unit.input_descs.empty());
2917  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2918  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2919  return nullptr;
2920  }
2921  const auto deleted_cd =
2922  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2923  if (!deleted_cd) {
2924  return nullptr;
2925  }
2926  CHECK(deleted_cd->columnType.is_boolean());
2927  const auto deleted_expr =
2928  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2929  outer_input_desc.getTableId(),
2930  deleted_cd->columnId,
2931  outer_input_desc.getNestLevel());
2932  CodeGenerator code_generator(this);
2933  const auto is_deleted =
2934  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2935  const auto is_deleted_bb = llvm::BasicBlock::Create(
2936  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2937  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2938  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2939  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2940  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2941  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2942  cgen_state_->ir_builder_.SetInsertPoint(bb);
2943  return bb;
2944 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1000
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:197
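
The control flow emitted above is a single conditional branch. A minimal IRBuilder sketch with invented names shows the shape: deleted rows return 0 from the row function, surviving rows continue in the returned block.

#include <llvm/IR/Function.h>
#include <llvm/IR/IRBuilder.h>

llvm::BasicBlock* emit_skip_deleted(llvm::IRBuilder<>& builder,
                                    llvm::Function* row_func,
                                    llvm::Value* is_deleted /* i1 */) {
  auto& ctx = builder.getContext();
  auto* deleted_bb = llvm::BasicBlock::Create(ctx, "is_deleted", row_func);
  auto* keep_bb = llvm::BasicBlock::Create(ctx, "is_not_deleted", row_func);
  builder.CreateCondBr(is_deleted, deleted_bb, keep_bb);
  builder.SetInsertPoint(deleted_bb);
  builder.CreateRet(builder.getInt32(0));  // deleted rows contribute nothing
  builder.SetInsertPoint(keep_bb);
  return keep_bb;  // caller keeps generating the row body here
}
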
void Executor::codegenWindowAvgEpilogue ( llvm::Value *  crt_val,
llvm::Value *  window_func_null_val,
llvm::Value *  multiplicity_lv 
)
private

Definition at line 289 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

291  {
293  const auto window_func_context =
295  const auto window_func = window_func_context->getWindowFunction();
296  const auto window_func_ti = get_adjusted_window_type_info(window_func);
297  const auto pi32_type =
298  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
299  const auto pi64_type =
300  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
301  const auto aggregate_state_type =
302  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
303  const auto aggregate_state_count_i64 = cgen_state_->llInt(
304  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
305  auto aggregate_state_count = cgen_state_->ir_builder_.CreateIntToPtr(
306  aggregate_state_count_i64, aggregate_state_type);
307  std::string agg_count_func_name = "agg_count";
308  switch (window_func_ti.get_type()) {
309  case kFLOAT: {
310  agg_count_func_name += "_float";
311  break;
312  }
313  case kDOUBLE: {
314  agg_count_func_name += "_double";
315  break;
316  }
317  default: {
318  break;
319  }
320  }
321  agg_count_func_name += "_skip_val";
322  cgen_state_->emitCall(agg_count_func_name,
323  {aggregate_state_count, crt_val, window_func_null_val});
324 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunction ( const size_t  target_index,
const CompilationOptions co 
)
private

Definition at line 21 of file WindowFunctionIR.cpp.

References WindowProjectNodeContext::activateWindowFunctionContext(), run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, COUNT, CUME_DIST, DENSE_RANK, logger::FATAL, FIRST_VALUE, WindowProjectNodeContext::get(), WindowFunctionContext::getWindowFunction(), LAG, LAST_VALUE, LEAD, LOG, MAX, MIN, NTILE, PERCENT_RANK, RANK, ROW_NUMBER, and SUM.

22  {
24  CodeGenerator code_generator(this);
25  const auto window_func_context =
27  target_index);
28  const auto window_func = window_func_context->getWindowFunction();
29  switch (window_func->getKind()) {
34  return cgen_state_->emitCall("row_number_window_func",
35  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
36  window_func_context->output())),
37  code_generator.posArg(nullptr)});
38  }
41  return cgen_state_->emitCall("percent_window_func",
42  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
43  window_func_context->output())),
44  code_generator.posArg(nullptr)});
45  }
51  const auto& args = window_func->getArgs();
52  CHECK(!args.empty());
53  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
54  CHECK_EQ(arg_lvs.size(), size_t(1));
55  return arg_lvs.front();
56  }
63  }
64  default: {
65  LOG(FATAL) << "Invalid window function kind";
66  }
67  }
68  return nullptr;
69 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
#define LOG(tag)
Definition: Logger.h:188
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
static const WindowProjectNodeContext * get(Executor *executor)
const WindowFunctionContext * activateWindowFunctionContext(Executor *executor, const size_t target_index) const
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenWindowFunctionAggregate(const CompilationOptions &co)
#define CHECK(condition)
Definition: Logger.h:197
const Analyzer::WindowFunction * getWindowFunction() const

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunctionAggregate ( const CompilationOptions co)
private

Definition at line 140 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, CHECK, WindowProjectNodeContext::get(), get_int_type(), and WindowProjectNodeContext::getActiveWindowFunctionContext().

140  {
142  const auto reset_state_false_bb = codegenWindowResetStateControlFlow();
143  auto aggregate_state = aggregateWindowStatePtr();
144  llvm::Value* aggregate_state_count = nullptr;
145  const auto window_func_context =
147  const auto window_func = window_func_context->getWindowFunction();
148  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
149  const auto aggregate_state_count_i64 = cgen_state_->llInt(
150  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
151  const auto pi64_type =
152  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
153  aggregate_state_count =
154  cgen_state_->ir_builder_.CreateIntToPtr(aggregate_state_count_i64, pi64_type);
155  }
156  codegenWindowFunctionStateInit(aggregate_state);
157  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
158  const auto count_zero = cgen_state_->llInt(int64_t(0));
159  cgen_state_->emitCall("agg_id", {aggregate_state_count, count_zero});
160  }
161  cgen_state_->ir_builder_.CreateBr(reset_state_false_bb);
162  cgen_state_->ir_builder_.SetInsertPoint(reset_state_false_bb);
164  return codegenWindowFunctionAggregateCalls(aggregate_state, co);
165 }
llvm::Value * aggregateWindowStatePtr()
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
static const WindowProjectNodeContext * get(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
void codegenWindowFunctionStateInit(llvm::Value *aggregate_state)
#define CHECK(condition)
Definition: Logger.h:197
llvm::Value * codegenWindowFunctionAggregateCalls(llvm::Value *aggregate_state, const CompilationOptions &co)
llvm::BasicBlock * codegenWindowResetStateControlFlow()

+ Here is the call graph for this function:

llvm::Value * Executor::codegenWindowFunctionAggregateCalls ( llvm::Value *  aggregate_state,
const CompilationOptions co 
)
private

Definition at line 246 of file WindowFunctionIR.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, CodeGenerator::codegen(), CodeGenerator::codegenCastBetweenIntTypes(), COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), anonymous_namespace{WindowFunctionIR.cpp}::get_window_agg_name(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kFLOAT, and SUM.

247  {
249  const auto window_func_context =
251  const auto window_func = window_func_context->getWindowFunction();
252  const auto window_func_ti = get_adjusted_window_type_info(window_func);
253  const auto window_func_null_val =
254  window_func_ti.is_fp()
255  ? cgen_state_->inlineFpNull(window_func_ti)
256  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
257  const auto& args = window_func->getArgs();
258  llvm::Value* crt_val;
259  if (args.empty()) {
260  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
261  crt_val = cgen_state_->llInt(int64_t(1));
262  } else {
263  CodeGenerator code_generator(this);
264  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
265  CHECK_EQ(arg_lvs.size(), size_t(1));
266  if (window_func->getKind() == SqlWindowFunctionKind::SUM && !window_func_ti.is_fp()) {
267  crt_val = code_generator.codegenCastBetweenIntTypes(
268  arg_lvs.front(), args.front()->get_type_info(), window_func_ti, false);
269  } else {
270  crt_val = window_func_ti.get_type() == kFLOAT
271  ? arg_lvs.front()
272  : cgen_state_->castToTypeIn(arg_lvs.front(), 64);
273  }
274  }
275  const auto agg_name = get_window_agg_name(window_func->getKind(), window_func_ti);
276  llvm::Value* multiplicity_lv = nullptr;
277  if (args.empty()) {
278  cgen_state_->emitCall(agg_name, {aggregate_state, crt_val});
279  } else {
280  cgen_state_->emitCall(agg_name + "_skip_val",
281  {aggregate_state, crt_val, window_func_null_val});
282  }
283  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
284  codegenWindowAvgEpilogue(crt_val, window_func_null_val, multiplicity_lv);
285  }
287 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
std::string get_window_agg_name(const SqlWindowFunctionKind kind, const SQLTypeInfo &window_func_ti)
void codegenWindowAvgEpilogue(llvm::Value *crt_val, llvm::Value *window_func_null_val, llvm::Value *multiplicity_lv)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenAggregateWindowState()
#define CHECK(condition)
Definition: Logger.h:197
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:
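The runtime function chosen above follows a simple naming convention: aggregate kind, a type suffix from get_window_agg_name, then "_skip_val" when a null sentinel must be skipped. The helper below only illustrates that convention; it is not the actual get_window_agg_name implementation.

#include <string>

// e.g. window_agg_builtin("count", "_double", true) == "agg_count_double_skip_val"
std::string window_agg_builtin(const std::string& agg_kind,
                               const std::string& type_suffix,
                               const bool skip_null) {
  std::string name = "agg_" + agg_kind + type_suffix;
  if (skip_null) {
    name += "_skip_val";
  }
  return name;
}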

void Executor::codegenWindowFunctionStateInit ( llvm::Value *  aggregate_state)
private

Definition at line 196 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, COUNT, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

196  {
198  const auto window_func_context =
200  const auto window_func = window_func_context->getWindowFunction();
201  const auto window_func_ti = get_adjusted_window_type_info(window_func);
202  const auto window_func_null_val =
203  window_func_ti.is_fp()
204  ? cgen_state_->inlineFpNull(window_func_ti)
205  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
206  llvm::Value* window_func_init_val;
207  if (window_func_context->getWindowFunction()->getKind() ==
209  switch (window_func_ti.get_type()) {
210  case kFLOAT: {
211  window_func_init_val = cgen_state_->llFp(float(0));
212  break;
213  }
214  case kDOUBLE: {
215  window_func_init_val = cgen_state_->llFp(double(0));
216  break;
217  }
218  default: {
219  window_func_init_val = cgen_state_->llInt(int64_t(0));
220  break;
221  }
222  }
223  } else {
224  window_func_init_val = window_func_null_val;
225  }
226  const auto pi32_type =
227  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
228  switch (window_func_ti.get_type()) {
229  case kDOUBLE: {
230  cgen_state_->emitCall("agg_id_double", {aggregate_state, window_func_init_val});
231  break;
232  }
233  case kFLOAT: {
234  aggregate_state =
235  cgen_state_->ir_builder_.CreateBitCast(aggregate_state, pi32_type);
236  cgen_state_->emitCall("agg_id_float", {aggregate_state, window_func_init_val});
237  break;
238  }
239  default: {
240  cgen_state_->emitCall("agg_id", {aggregate_state, window_func_init_val});
241  break;
242  }
243  }
244 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:

llvm::BasicBlock * Executor::codegenWindowResetStateControlFlow ( )
private

Definition at line 167 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, WindowProjectNodeContext::getActiveWindowFunctionContext(), CodeGenerator::posArg(), and CodeGenerator::toBool().

167  {
169  const auto window_func_context =
171  const auto bitset = cgen_state_->llInt(
172  reinterpret_cast<const int64_t>(window_func_context->partitionStart()));
173  const auto min_val = cgen_state_->llInt(int64_t(0));
174  const auto max_val = cgen_state_->llInt(window_func_context->elementCount() - 1);
175  const auto null_val = cgen_state_->llInt(inline_int_null_value<int64_t>());
176  const auto null_bool_val = cgen_state_->llInt<int8_t>(inline_int_null_value<int8_t>());
177  CodeGenerator code_generator(this);
178  const auto reset_state =
179  code_generator.toBool(cgen_state_->emitCall("bit_is_set",
180  {bitset,
181  code_generator.posArg(nullptr),
182  min_val,
183  max_val,
184  null_val,
185  null_bool_val}));
186  const auto reset_state_true_bb = llvm::BasicBlock::Create(
187  cgen_state_->context_, "reset_state.true", cgen_state_->current_func_);
188  const auto reset_state_false_bb = llvm::BasicBlock::Create(
189  cgen_state_->context_, "reset_state.false", cgen_state_->current_func_);
190  cgen_state_->ir_builder_.CreateCondBr(
191  reset_state, reset_state_true_bb, reset_state_false_bb);
192  cgen_state_->ir_builder_.SetInsertPoint(reset_state_true_bb);
193  return reset_state_false_bb;
194 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)

+ Here is the call graph for this function:

ResultSetPtr Executor::collectAllDeviceResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
const QueryMemoryDescriptor query_mem_desc,
const ExecutorDeviceType  device_type,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner 
)
private

Definition at line 1797 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::build_row_for_empty_input(), catalog_(), DEBUG_TIMER, SharedKernelContext::getFragmentResults(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, NonGroupedAggregate, GroupByAndAggregate::shard_count_for_top_groups(), RelAlgExecutionUnit::target_exprs, and use_speculative_top_n().

1802  {
1803  auto timer = DEBUG_TIMER(__func__);
1804  auto& result_per_device = shared_context.getFragmentResults();
1805  if (result_per_device.empty() && query_mem_desc.getQueryDescriptionType() ==
1808  ra_exe_unit.target_exprs, query_mem_desc, device_type);
1809  }
1810  if (use_speculative_top_n(ra_exe_unit, query_mem_desc)) {
1811  try {
1812  return reduceSpeculativeTopN(
1813  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1814  } catch (const std::bad_alloc&) {
1815  throw SpeculativeTopNFailed("Failed during multi-device reduction.");
1816  }
1817  }
1818  const auto shard_count =
1819  device_type == ExecutorDeviceType::GPU
1821  : 0;
1822 
1823  if (shard_count && !result_per_device.empty()) {
1824  return collectAllDeviceShardedTopResults(shared_context, ra_exe_unit);
1825  }
1826  return reduceMultiDeviceResults(
1827  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
1828 }
std::vector< Analyzer::Expr * > target_exprs
bool use_speculative_top_n(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc)
ResultSetPtr reduceSpeculativeTopN(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:1003
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
ResultSetPtr reduceMultiDeviceResults(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:896
ResultSetPtr collectAllDeviceShardedTopResults(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit) const
Definition: Execute.cpp:1912
QueryDescriptionType getQueryDescriptionType() const
ResultSetPtr build_row_for_empty_input(const std::vector< Analyzer::Expr * > &target_exprs_in, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: Execute.cpp:1755
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define DEBUG_TIMER(name)
Definition: Logger.h:313
static size_t shard_count_for_top_groups(const RelAlgExecutionUnit &ra_exe_unit, const Catalog_Namespace::Catalog &catalog)

+ Here is the call graph for this function:

ResultSetPtr Executor::collectAllDeviceShardedTopResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit 
) const
private

Definition at line 1912 of file Execute.cpp.

References catalog_(), CHECK, CHECK_EQ, CHECK_LE, SharedKernelContext::getFragmentResults(), SortInfo::limit, SortInfo::offset, SortInfo::order_entries, anonymous_namespace{Execute.cpp}::permute_storage_columnar(), anonymous_namespace{Execute.cpp}::permute_storage_row_wise(), run_benchmark_import::result, and RelAlgExecutionUnit::sort_info.

1914  {
1915  auto& result_per_device = shared_context.getFragmentResults();
1916  const auto first_result_set = result_per_device.front().first;
1917  CHECK(first_result_set);
1918  auto top_query_mem_desc = first_result_set->getQueryMemDesc();
1919  CHECK(!top_query_mem_desc.hasInterleavedBinsOnGpu());
1920  const auto top_n = ra_exe_unit.sort_info.limit + ra_exe_unit.sort_info.offset;
1921  top_query_mem_desc.setEntryCount(0);
1922  for (auto& result : result_per_device) {
1923  const auto result_set = result.first;
1924  CHECK(result_set);
1925  result_set->sort(ra_exe_unit.sort_info.order_entries, top_n, this);
1926  size_t new_entry_cnt = top_query_mem_desc.getEntryCount() + result_set->rowCount();
1927  top_query_mem_desc.setEntryCount(new_entry_cnt);
1928  }
1929  auto top_result_set = std::make_shared<ResultSet>(first_result_set->getTargetInfos(),
1930  first_result_set->getDeviceType(),
1931  top_query_mem_desc,
1932  first_result_set->getRowSetMemOwner(),
1933  catalog_,
1934  blockSize(),
1935  gridSize());
1936  auto top_storage = top_result_set->allocateStorage();
1937  size_t top_output_row_idx{0};
1938  for (auto& result : result_per_device) {
1939  const auto result_set = result.first;
1940  CHECK(result_set);
1941  const auto& top_permutation = result_set->getPermutationBuffer();
1942  CHECK_LE(top_permutation.size(), top_n);
1943  if (top_query_mem_desc.didOutputColumnar()) {
1944  top_output_row_idx = permute_storage_columnar(result_set->getStorage(),
1945  result_set->getQueryMemDesc(),
1946  top_storage,
1947  top_output_row_idx,
1948  top_query_mem_desc,
1949  top_permutation);
1950  } else {
1951  top_output_row_idx = permute_storage_row_wise(result_set->getStorage(),
1952  top_storage,
1953  top_output_row_idx,
1954  top_query_mem_desc,
1955  top_permutation);
1956  }
1957  }
1958  CHECK_EQ(top_output_row_idx, top_query_mem_desc.getEntryCount());
1959  return top_result_set;
1960 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const std::list< Analyzer::OrderEntry > order_entries
size_t permute_storage_row_wise(const ResultSetStorage *input_storage, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1891
const size_t limit
const SortInfo sort_info
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
#define CHECK_LE(x, y)
Definition: Logger.h:208
unsigned gridSize() const
Definition: Execute.cpp:3156
size_t permute_storage_columnar(const ResultSetStorage *input_storage, const QueryMemoryDescriptor &input_query_mem_desc, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:1841
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define CHECK(condition)
Definition: Logger.h:197
unsigned blockSize() const
Definition: Execute.cpp:3173
const size_t offset

+ Here is the call graph for this function:
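The per-device strategy above is the usual sharded top-N pattern: each shard sorts and truncates locally, and the union of the survivors is guaranteed to contain the global top N. The plain-STL sketch below shows the idea with integers; the real method copies each sorted ResultSet's permuted storage into one combined result rather than merging values directly.

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<int> merge_sharded_top_n(std::vector<std::vector<int>> per_device,
                                     const size_t top_n) {
  std::vector<int> merged;
  for (auto& rows : per_device) {
    std::sort(rows.begin(), rows.end());  // per-device sort
    if (rows.size() > top_n) {
      rows.resize(top_n);                 // keep only the local top-N
    }
    merged.insert(merged.end(), rows.begin(), rows.end());
  }
  std::sort(merged.begin(), merged.end());  // final pass over the survivors
  if (merged.size() > top_n) {
    merged.resize(top_n);
  }
  return merged;
}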

bool Executor::compileBody ( const RelAlgExecutionUnit ra_exe_unit,
GroupByAndAggregate group_by_and_aggregate,
const QueryMemoryDescriptor query_mem_desc,
const CompilationOptions co,
const GpuSharedMemoryContext gpu_smem_context = {} 
)
private

Definition at line 2946 of file NativeCodegen.cpp.

2950  {
2952 
2953  // Switch the code generation into a separate filter function if enabled.
2954  // Note that accesses to function arguments are still codegenned from the
2955  // row function's arguments, then later automatically forwarded and
2956  // remapped into filter function arguments by redeclareFilterFunction().
2957  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2958  llvm::Value* loop_done{nullptr};
2959  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2960  if (cgen_state_->filter_func_) {
2961  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2962  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2963  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2964  row_func_entry_bb->begin());
2965  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2966  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2967  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2968  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2969  }
2970  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2971  cgen_state_->current_func_ = cgen_state_->filter_func_;
2972  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2973  }
2974 
2975  // generate the code for the filter
2976  std::vector<Analyzer::Expr*> primary_quals;
2977  std::vector<Analyzer::Expr*> deferred_quals;
2978  bool short_circuited =
2979  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
2980  if (short_circuited) {
2981  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2982  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2983  << " quals";
2984  }
2985  llvm::Value* filter_lv = cgen_state_->llBool(true);
2986  CodeGenerator code_generator(this);
2987  for (auto expr : primary_quals) {
2988  // Generate the filter for primary quals
2989  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2990  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2991  }
2992  CHECK(filter_lv->getType()->isIntegerTy(1));
2993  llvm::BasicBlock* sc_false{nullptr};
2994  if (!deferred_quals.empty()) {
2995  auto sc_true = llvm::BasicBlock::Create(
2996  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2997  sc_false = llvm::BasicBlock::Create(
2998  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2999  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3000  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3001  if (ra_exe_unit.join_quals.empty()) {
3002  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3003  }
3004  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3005  filter_lv = cgen_state_->llBool(true);
3006  }
3007  for (auto expr : deferred_quals) {
3008  filter_lv = cgen_state_->ir_builder_.CreateAnd(
3009  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
3010  }
3011 
3012  CHECK(filter_lv->getType()->isIntegerTy(1));
3013  auto ret = group_by_and_aggregate.codegen(
3014  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3015 
3016  // Switch the code generation back to the row function if a filter
3017  // function was enabled.
3018  if (cgen_state_->filter_func_) {
3019  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3020  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
3021  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3022  }
3023 
3025 
3026  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3027  cgen_state_->current_func_ = cgen_state_->row_func_;
3028  cgen_state_->filter_func_call_ =
3029  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3030 
3031  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3032  auto loop_done_true = llvm::BasicBlock::Create(
3033  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
3034  auto loop_done_false = llvm::BasicBlock::Create(
3035  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
3036  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
3037  cgen_state_->ir_builder_.CreateCondBr(
3038  loop_done_flag, loop_done_true, loop_done_false);
3039  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3040  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3041  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3042  } else {
3043  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3044  }
3045  }
3046  return ret;
3047 }
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr * > &primary_quals, std::vector< Analyzer::Expr * > &deferred_quals)
Definition: LogicalIR.cpp:157
std::string to_string(char const *&&v)
const JoinQualsPerNestingLevel join_quals
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:197
void redeclareFilterFunction()
Definition: IRCodegen.cpp:537
#define VLOG(n)
Definition: Logger.h:291
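
The primary/deferred split produced by prioritizeQuals is a standard short-circuit: only rows that pass the cheap predicates ever evaluate the expensive ones. A runtime analogue, with lambdas standing in for the generated blocks:

#include <functional>
#include <vector>

using Qual = std::function<bool(int)>;  // a predicate over a row id

bool row_passes(const int row,
                const std::vector<Qual>& primary_quals,
                const std::vector<Qual>& deferred_quals) {
  for (const auto& q : primary_quals) {
    if (!q(row)) {
      return false;  // the sc_false path: deferred quals never run
    }
  }
  for (const auto& q : deferred_quals) {
    if (!q(row)) {
      return false;
    }
  }
  return true;  // row reaches the aggregation code
}
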
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > Executor::compileWorkUnit ( const std::vector< InputTableInfo > &  query_infos,
const PlanState::DeletedColumnsMap deleted_cols_map,
const RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co,
const ExecutionOptions eo,
const CudaMgr_Namespace::CudaMgr cuda_mgr,
const bool  allow_lazy_fetch,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
const bool  has_cardinality_estimation,
ColumnCacheMap column_cache,
RenderInfo render_info = nullptr 
)
private

Definition at line 2447 of file NativeCodegen.cpp.

2459  {
2460  auto timer = DEBUG_TIMER(__func__);
2461 
2462 #ifndef NDEBUG
2463  static std::uint64_t counter = 0;
2464  ++counter;
2465  VLOG(1) << "CODEGEN #" << counter << ":";
2466  LOG(IR) << "CODEGEN #" << counter << ":";
2467  LOG(PTX) << "CODEGEN #" << counter << ":";
2468  LOG(ASM) << "CODEGEN #" << counter << ":";
2469 #endif
2470 
2471  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2472 
2473  GroupByAndAggregate group_by_and_aggregate(
2474  this,
2475  co.device_type,
2476  ra_exe_unit,
2477  query_infos,
2478  row_set_mem_owner,
2479  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2480  : std::nullopt);
2481  auto query_mem_desc =
2482  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2483  max_groups_buffer_entry_guess,
2484  crt_min_byte_width,
2485  render_info,
2487 
2488  if (query_mem_desc->getQueryDescriptionType() ==
2490  !has_cardinality_estimation &&
2491  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2492  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2493  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2494  }
2495 
2496  const bool output_columnar = query_mem_desc->didOutputColumnar();
2497  const bool gpu_shared_mem_optimization =
2499  ra_exe_unit,
2500  cuda_mgr,
2501  co.device_type,
2502  cuda_mgr ? this->blockSize() : 1,
2503  cuda_mgr ? this->numBlocksPerMP() : 1);
2504  if (gpu_shared_mem_optimization) {
2505  // disable interleaved bins optimization on the GPU
2506  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2507  LOG(DEBUG1) << "GPU shared memory is used for the " +
2508  query_mem_desc->queryDescTypeToString() + " query(" +
2509  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2510  query_mem_desc.get())) +
2511  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2512  }
2513 
2514  const GpuSharedMemoryContext gpu_smem_context(
2515  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2516 
2518  const size_t num_count_distinct_descs =
2519  query_mem_desc->getCountDistinctDescriptorsSize();
2520  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2521  const auto& count_distinct_descriptor =
2522  query_mem_desc->getCountDistinctDescriptor(i);
2523  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2524  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2525  !co.hoist_literals)) {
2526  throw QueryMustRunOnCpu();
2527  }
2528  }
2529  }
2530 
2531  // Read the module template and target either CPU or GPU
2532  // by binding the stream position functions to the right implementation:
2533  // stride access for GPU, contiguous for CPU
2534  auto rt_module_copy = llvm::CloneModule(
2535  *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
2536  auto func = llvm::dyn_cast<llvm::Function>(gv);
2537  if (!func) {
2538  return true;
2539  }
2540  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2541  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2543  });
2545  if (is_udf_module_present(true)) {
2547  }
2548  if (is_rt_udf_module_present(true)) {
2550  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2551  }
2552  } else {
2553  rt_module_copy->setDataLayout(get_gpu_data_layout());
2554  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2555  if (is_udf_module_present()) {
2557  }
2558  if (is_rt_udf_module_present()) {
2560  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2561  }
2562  }
2563 
2564  cgen_state_->module_ = rt_module_copy.release();
2566 
2567  auto agg_fnames =
2568  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2569 
2570  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2571 
2572  const bool is_group_by{query_mem_desc->isGroupBy()};
2573  auto [query_func, row_func_call] = is_group_by
2575  co.hoist_literals,
2576  *query_mem_desc,
2577  co.device_type,
2578  ra_exe_unit.scan_limit,
2579  gpu_smem_context)
2580  : query_template(cgen_state_->module_,
2581  agg_slot_count,
2582  co.hoist_literals,
2583  !!ra_exe_unit.estimator,
2584  gpu_smem_context);
2585  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2586  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2587  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2588 
2589  cgen_state_->query_func_ = query_func;
2590  cgen_state_->row_func_call_ = row_func_call;
2591  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2592  &query_func->getEntryBlock().front());
2593 
2594  // Generate the function signature and column head fetches s.t.
2595  // double indirection isn't needed in the inner loop
2596  auto& fetch_bb = query_func->front();
2597  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2598  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2599  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2600  query_func->args().begin(),
2601  fetch_ir_builder,
2602  cgen_state_->context_);
2603  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2604 
2605  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2606  is_group_by ? 0 : agg_slot_count,
2607  co.hoist_literals,
2608  cgen_state_->module_,
2609  cgen_state_->context_);
2610  CHECK(cgen_state_->row_func_);
2611  cgen_state_->row_func_bb_ =
2612  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2613 
2615  auto filter_func_ft =
2616  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2617  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2618  llvm::Function::ExternalLinkage,
2619  "filter_func",
2620  cgen_state_->module_);
2621  CHECK(cgen_state_->filter_func_);
2622  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2623  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2624  }
2625 
2626  cgen_state_->current_func_ = cgen_state_->row_func_;
2627  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2628 
2629  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2630  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2631  const auto join_loops =
2632  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2633 
2634  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2635  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2636  if (is_not_deleted_bb) {
2637  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2638  }
2639  if (!join_loops.empty()) {
2640  codegenJoinLoops(join_loops,
2641  body_execution_unit,
2642  group_by_and_aggregate,
2643  query_func,
2644  cgen_state_->row_func_bb_,
2645  *(query_mem_desc.get()),
2646  co,
2647  eo);
2648  } else {
2649  const bool can_return_error = compileBody(
2650  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2651  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2653  createErrorCheckControlFlow(query_func,
2656  co.device_type,
2657  group_by_and_aggregate.query_infos_);
2658  }
2659  }
2660  std::vector<llvm::Value*> hoisted_literals;
2661 
2662  if (co.hoist_literals) {
2663  VLOG(1) << "number of hoisted literals: "
2664  << cgen_state_->query_func_literal_loads_.size()
2665  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2666  << " bytes";
2667  }
2668 
2669  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2670  // we have some hoisted literals...
2671  hoisted_literals = inlineHoistedLiterals();
2672  }
2673 
2674  // replace the row func placeholder call with the call to the actual row func
2675  std::vector<llvm::Value*> row_func_args;
2676  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2677  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2678  }
2679  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2680  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2681  // push hoisted literals arguments, if any
2682  row_func_args.insert(
2683  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2684  llvm::ReplaceInstWithInst(
2685  cgen_state_->row_func_call_,
2686  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2687 
2688  // replace the filter func placeholder call with the call to the actual filter func
2689  if (cgen_state_->filter_func_) {
2690  std::vector<llvm::Value*> filter_func_args;
2691  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2692  arg_it != cgen_state_->filter_func_args_.end();
2693  ++arg_it) {
2694  filter_func_args.push_back(arg_it->first);
2695  }
2696  llvm::ReplaceInstWithInst(
2697  cgen_state_->filter_func_call_,
2698  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2699  }
2700 
2701  // Aggregate
2702  plan_state_->init_agg_vals_ =
2703  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2704 
2705  /*
2706  * If we have decided to use GPU shared memory (decision is not made here), then
2707  * we generate proper code for extra components that it needs (buffer initialization and
2708  * gpu reduction from shared memory to global memory). We then splice these functions
2709  * into the already compiled query_func (replacing two placeholders, write_back_nop and
2710  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
2711  */
2712  if (gpu_smem_context.isSharedMemoryUsed()) {
2713  if (query_mem_desc->getQueryDescriptionType() ==
2715  GpuSharedMemCodeBuilder gpu_smem_code(
2716  cgen_state_->module_,
2717  cgen_state_->context_,
2718  *query_mem_desc,
2720  plan_state_->init_agg_vals_);
2721  gpu_smem_code.codegen();
2722  gpu_smem_code.injectFunctionsInto(query_func);
2723 
2724  // helper functions are used for caching purposes later
2725  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2726  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2727  LOG(IR) << gpu_smem_code.toString();
2728  }
2729  }
2730 
2731  auto multifrag_query_func = cgen_state_->module_->getFunction(
2732  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2733  CHECK(multifrag_query_func);
2734 
2736  insertErrorCodeChecker(multifrag_query_func, co.hoist_literals);
2737  }
2738 
2739  bind_query(query_func,
2740  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2741  multifrag_query_func,
2742  cgen_state_->module_);
2743 
2744  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2745  if (cgen_state_->filter_func_) {
2746  root_funcs.push_back(cgen_state_->filter_func_);
2747  }
2748  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2749  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2750 
2751  // Always inline the row function and the filter function.
2752  // We don't want register spills in the inner loops.
2753  // LLVM seems to correctly free up alloca instructions
2754  // in these functions even when they are inlined.
2756  if (cgen_state_->filter_func_) {
2758  }
2759 
2760 #ifndef NDEBUG
2761  // Add helpful metadata to the LLVM IR for debugging.
2763 #endif
2764 
2765  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2766  std::string llvm_ir;
2767  if (eo.just_explain) {
2769 #ifdef WITH_JIT_DEBUG
2770  throw std::runtime_error(
2771  "Explain optimized not available when JIT runtime debug symbols are enabled");
2772 #else
2773  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2774  // optimized IR after NVVM reflect
2775  llvm::legacy::PassManager pass_manager;
2776  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2777 #endif // WITH_JIT_DEBUG
2778  }
2779  llvm_ir =
2780  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2781  serialize_llvm_object(cgen_state_->row_func_) +
2782  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2783  : "");
2784 
2785 #ifndef NDEBUG
2786  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2787 #endif
2788  }
2789 
2790  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2791  LOG(IR) << "IR for the "
2792  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2793 #ifdef NDEBUG
2794  LOG(IR) << serialize_llvm_object(query_func)
2795  << serialize_llvm_object(cgen_state_->row_func_)
2796  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2797  : "")
2798  << "\nEnd of IR";
2799 #else
2800  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2801 #endif
2802 
2803  // Run some basic validation checks on the LLVM IR before code is generated below.
2804  verify_function_ir(cgen_state_->row_func_);
2805  if (cgen_state_->filter_func_) {
2806  verify_function_ir(cgen_state_->filter_func_);
2807  }
2808 
2809  // Generate final native code from the LLVM IR.
2810  return std::make_tuple(
2813  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2814  : optimizeAndCodegenGPU(query_func,
2815  multifrag_query_func,
2816  live_funcs,
2817  is_group_by || ra_exe_unit.estimator,
2818  cuda_mgr,
2819  co),
2820  cgen_state_->getLiterals(),
2821  output_columnar,
2822  llvm_ir,
2823  std::move(gpu_smem_context)},
2824  std::move(query_mem_desc));
2825 }
std::tuple< llvm::Function *, llvm::CallInst * > query_group_by_template(llvm::Module *module, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
std::vector< Analyzer::Expr * > target_exprs
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::unique_ptr< llvm::Module > rt_udf_cpu_module
void codegenJoinLoops(const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
Definition: IRCodegen.cpp:653
std::unique_ptr< llvm::Module > udf_gpu_module
#define LOG(tag)
Definition: Logger.h:188
std::unique_ptr< llvm::Module > rt_udf_gpu_module
void mark_function_always_inline(llvm::Function *func)
llvm::StringRef get_gpu_data_layout()
bool is_udf_module_present(bool cpu_only=false)
std::vector< InputDescriptor > input_descs
std::string serialize_llvm_metadata_footnotes(llvm::Function *query_func, CgenState *cgen_state)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
unsigned numBlocksPerMP() const
Definition: Execute.cpp:3165
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
void optimize_ir(llvm::Function *query_func, llvm::Module *module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co)
std::vector< std::string > get_agg_fnames(const std::vector< Analyzer::Expr * > &target_exprs, const bool is_group_by)
std::string to_string(char const *&&v)
void preloadFragOffsets(const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
Definition: Execute.cpp:3097
void insertErrorCodeChecker(llvm::Function *query_func, bool hoist_literals)
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned gpu_blocksize, const unsigned num_blocks_per_mp)
llvm::StringRef get_gpu_target_triple_string()
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
void verify_function_ir(const llvm::Function *func)
const bool allow_multifrag
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:162
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU(llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
static std::unordered_set< llvm::Function * > markDeadRuntimeFuncs(llvm::Module &module, const std::vector< llvm::Function * > &roots, const std::vector< llvm::Function * > &leaves)
const bool with_dynamic_watchdog
std::unique_ptr< llvm::Module > g_rt_module
ExecutorExplainType explain_type
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU(llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1000
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
const std::shared_ptr< Analyzer::Estimator > estimator
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define AUTOMATIC_IR_METADATA_DONE()
ExecutorDeviceType device_type
llvm::BasicBlock * codegenSkipDeletedOuterTableRow(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
std::string serialize_llvm_object(const T *llvm_obj)
static bool alwaysCloneRuntimeFunction(const llvm::Function *func)
std::vector< llvm::Value * > generate_column_heads_load(const int num_columns, llvm::Value *byte_stream_arg, llvm::IRBuilder<> &ir_builder, llvm::LLVMContext &ctx)
std::unique_ptr< llvm::Module > udf_cpu_module
bool g_enable_filter_function
Definition: Execute.cpp:79
void bind_pos_placeholders(const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *module)
void nukeOldState(const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
Definition: Execute.cpp:3078
std::tuple< llvm::Function *, llvm::CallInst * > query_template(llvm::Module *module, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
std::list< std::shared_ptr< Analyzer::Expr > > quals
bool isPotentialInSituRender() const
Definition: RenderInfo.cpp:64
#define CHECK(condition)
Definition: Logger.h:197
#define DEBUG_TIMER(name)
Definition: Logger.h:313
std::vector< llvm::Value * > inlineHoistedLiterals()
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
llvm::Function * create_row_function(const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *module, llvm::LLVMContext &context)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:260
unsigned blockSize() const
Definition: Execute.cpp:3173
const bool allow_runtime_query_interrupt
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
bool is_rt_udf_module_present(bool cpu_only=false)
#define VLOG(n)
Definition: Logger.h:291
size_t get_shared_memory_size(const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
void bind_query(llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *module)
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
size_t g_gpu_smem_threshold
Definition: Execute.cpp:114
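
The listing above swaps the row_func and filter_func placeholder calls for calls to the generated functions using a generic LLVM pattern: collect the placeholder's operands, append the extra arguments (column heads, the join hash tables pointer, and any hoisted literals), and splice the new call in place. A minimal sketch of that pattern outside the Executor (the helper name and signature are illustrative, not OmniSciDB code):

  #include <llvm/IR/Instructions.h>
  #include <llvm/Transforms/Utils/BasicBlockUtils.h>
  #include <vector>

  void replace_placeholder_call(llvm::CallInst* placeholder_call,
                                llvm::Function* real_func,
                                const std::vector<llvm::Value*>& extra_args) {
    std::vector<llvm::Value*> args;
    for (size_t i = 0; i < placeholder_call->getNumArgOperands(); ++i) {
      args.push_back(placeholder_call->getArgOperand(i));
    }
    args.insert(args.end(), extra_args.begin(), extra_args.end());
    // ReplaceInstWithInst unlinks the placeholder and splices the new call into its place.
    llvm::ReplaceInstWithInst(placeholder_call,
                              llvm::CallInst::Create(real_func, args, ""));
  }
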
AggregatedColRange Executor::computeColRangesCache ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3521 of file Execute.cpp.

References catalog_(), CHECK, getLeafColumnRange(), AggregatedColRange::setColRange(), and ExpressionRange::typeSupportsRange().

3522  {
3523  AggregatedColRange agg_col_range_cache;
3524  CHECK(catalog_);
3525  std::unordered_set<int> phys_table_ids;
3526  for (const auto& phys_input : phys_inputs) {
3527  phys_table_ids.insert(phys_input.table_id);
3528  }
3529  std::vector<InputTableInfo> query_infos;
3530  for (const int table_id : phys_table_ids) {
3531  query_infos.emplace_back(InputTableInfo{table_id, getTableInfo(table_id)});
3532  }
3533  for (const auto& phys_input : phys_inputs) {
3534  const auto cd =
3535  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3536  CHECK(cd);
3537  if (ExpressionRange::typeSupportsRange(cd->columnType)) {
3538  const auto col_var = boost::make_unique<Analyzer::ColumnVar>(
3539  cd->columnType, phys_input.table_id, phys_input.col_id, 0);
3540  const auto col_range = getLeafColumnRange(col_var.get(), query_infos, this, false);
3541  agg_col_range_cache.setColRange(phys_input, col_range);
3542  }
3543  }
3544  return agg_col_range_cache;
3545 }
Fragmenter_Namespace::TableInfo getTableInfo(const int table_id) const
Definition: Execute.cpp:290
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
ExpressionRange getLeafColumnRange(const Analyzer::ColumnVar *col_expr, const std::vector< InputTableInfo > &query_infos, const Executor *executor, const bool is_outer_join_proj)
#define CHECK(condition)
Definition: Logger.h:197
void setColRange(const PhysicalInput &, const ExpressionRange &)
static bool typeSupportsRange(const SQLTypeInfo &ti)

+ Here is the call graph for this function:
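
For reference, the cache built above holds one value range per physical column input, keyed by (table_id, col_id). A minimal, self-contained sketch of that shape with hypothetical stand-in types (SimpleRange and SimpleColRangeCache are not OmniSciDB classes):

  #include <cstdint>
  #include <map>
  #include <utility>

  struct SimpleRange {
    int64_t min;
    int64_t max;
  };

  using ColKey = std::pair<int, int>;  // (table_id, col_id)

  class SimpleColRangeCache {
   public:
    void setColRange(const ColKey& key, const SimpleRange& range) { ranges_[key] = range; }
    const SimpleRange* getColRange(const ColKey& key) const {
      auto it = ranges_.find(key);
      return it == ranges_.end() ? nullptr : &it->second;
    }

   private:
    std::map<ColKey, SimpleRange> ranges_;
  };

Only columns whose type supports range computation (per ExpressionRange::typeSupportsRange) get an entry; everything else is simply absent from the cache.
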

StringDictionaryGenerations Executor::computeStringDictionaryGenerations ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 3547 of file Execute.cpp.

References catalog_(), CHECK, kENCODING_DICT, and StringDictionaryGenerations::setGeneration().

3548  {
3549  StringDictionaryGenerations string_dictionary_generations;
3550  CHECK(catalog_);
3551  for (const auto& phys_input : phys_inputs) {
3552  const auto cd =
3553  catalog_->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
3554  CHECK(cd);
3555  const auto& col_ti =
3556  cd->columnType.is_array() ? cd->columnType.get_elem_type() : cd->columnType;
3557  if (col_ti.is_string() && col_ti.get_compression() == kENCODING_DICT) {
3558  const int dict_id = col_ti.get_comp_param();
3559  const auto dd = catalog_->getMetadataForDict(dict_id);
3560  CHECK(dd && dd->stringDict);
3561  string_dictionary_generations.setGeneration(dict_id,
3562  dd->stringDict->storageEntryCount());
3563  }
3564  }
3565  return string_dictionary_generations;
3566 }
void setGeneration(const uint32_t id, const uint64_t generation)
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
const DictDescriptor * getMetadataForDict(int dict_ref, bool loadDict=true) const
Definition: Catalog.cpp:1439
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:
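
The generation recorded above is the dictionary's storageEntryCount at the time the query is admitted. Since string ids are assigned densely from zero, any id at or beyond that count was appended later and should not be visible to the query. A tiny illustrative helper capturing that rule (this is not the StringDictionaryProxy API, just a sketch of the invariant):

  #include <cstdint>

  inline bool id_visible_to_query(int32_t string_id, uint64_t generation) {
    // ids below the recorded entry count existed when the generation was taken
    return string_id >= 0 && static_cast<uint64_t>(string_id) < generation;
  }
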

TableGenerations Executor::computeTableGenerations ( std::unordered_set< int >  phys_table_ids)
private

Definition at line 3568 of file Execute.cpp.

References TableGenerations::setGeneration().

3569  {
3570  TableGenerations table_generations;
3571  for (const int table_id : phys_table_ids) {
3572  const auto table_info = getTableInfo(table_id);
3573  table_generations.setGeneration(
3574  table_id,
3575  TableGeneration{static_cast<int64_t>(table_info.getPhysicalNumTuples()), 0});
3576  }
3577  return table_generations;
3578 }
Fragmenter_Namespace::TableInfo getTableInfo(const int table_id) const
Definition: Execute.cpp:290
void setGeneration(const uint32_t id, const TableGeneration &generation)

+ Here is the call graph for this function:
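
A table generation pairs the physical tuple count at query admission with a starting offset (zero in the listing above). A minimal sketch with hypothetical stand-in types (SimpleTableGeneration is not the OmniSciDB TableGeneration struct):

  #include <cstdint>
  #include <unordered_map>

  struct SimpleTableGeneration {
    int64_t tuple_count;   // physical tuples when the query was admitted
    int64_t start_rowid;   // offset, zero in the listing above
  };

  using SimpleTableGenerations = std::unordered_map<int, SimpleTableGeneration>;

  SimpleTableGenerations compute_generations(
      const std::unordered_map<int, int64_t>& tuples_per_table) {
    SimpleTableGenerations generations;
    for (const auto& [table_id, num_tuples] : tuples_per_table) {
      generations[table_id] = SimpleTableGeneration{num_tuples, 0};
    }
    return generations;
  }
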

bool Executor::containsLeftDeepOuterJoin ( ) const
inline

Definition at line 424 of file Execute.h.

References cgen_state_.

424  {
425  return cgen_state_->contains_left_deep_outer_join_;
426  }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
void Executor::createErrorCheckControlFlow ( llvm::Function *  query_func,
bool  run_with_dynamic_watchdog,
bool  run_with_allowing_runtime_interrupt,
ExecutorDeviceType  device_type,
const std::vector< InputTableInfo > &  input_table_infos 
)
private

Definition at line 1801 of file NativeCodegen.cpp.

1806  {
1808 
1809  // check whether the row processing was successful; currently, it can
1810  // fail by running out of group by buffer slots
1811 
1812  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1813  // when both the dynamic watchdog and the runtime interrupt are enabled,
1814  // we use the dynamic watchdog
1815  run_with_allowing_runtime_interrupt = false;
1816  }
1817 
1818  {
1819  // disable injecting query interrupt checker if the session info is invalid
1820  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1821  if (current_query_session_.empty()) {
1822  run_with_allowing_runtime_interrupt = false;
1823  }
1824  }
1825 
1826  llvm::Value* row_count = nullptr;
1827  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1828  device_type == ExecutorDeviceType::GPU) {
1829  row_count =
1830  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1831  }
1832 
1833  bool done_splitting = false;
1834  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1835  ++bb_it) {
1836  llvm::Value* pos = nullptr;
1837  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1838  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1839  llvm::isa<llvm::PHINode>(*inst_it)) {
1840  if (inst_it->getName() == "pos") {
1841  pos = &*inst_it;
1842  }
1843  continue;
1844  }
1845  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1846  continue;
1847  }
1848  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1849  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1850  auto next_inst_it = inst_it;
1851  ++next_inst_it;
1852  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1853  auto& br_instr = bb_it->back();
1854  llvm::IRBuilder<> ir_builder(&br_instr);
1855  llvm::Value* err_lv = &*inst_it;
1856  llvm::Value* err_lv_returned_from_row_func = nullptr;
1857  if (run_with_dynamic_watchdog) {
1858  CHECK(pos);
1859  llvm::Value* call_watchdog_lv = nullptr;
1860  if (device_type == ExecutorDeviceType::GPU) {
1861  // In order to make sure all threads within a block see the same barrier,
1862  // only blocks in which none of the threads have crossed the critical
1863  // edge go through the dynamic watchdog computation
1864  CHECK(row_count);
1865  auto crit_edge_rem =
1866  (blockSize() & (blockSize() - 1))
1867  ? ir_builder.CreateSRem(
1868  row_count,
1869  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1870  : ir_builder.CreateAnd(
1871  row_count,
1872  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1873  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1874  crit_edge_threshold->setName("crit_edge_threshold");
1875 
1876  // only those threads where pos < crit_edge_threshold go through dynamic
1877  // watchdog call
1878  call_watchdog_lv =
1879  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1880  } else {
1881  // CPU path: run watchdog for every 64th row
1882  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1883  call_watchdog_lv = ir_builder.CreateICmp(
1884  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1885  }
1886  CHECK(call_watchdog_lv);
1887  auto error_check_bb = bb_it->splitBasicBlock(
1888  llvm::BasicBlock::iterator(br_instr), ".error_check");
1889  auto& watchdog_br_instr = bb_it->back();
1890 
1891  auto watchdog_check_bb = llvm::BasicBlock::Create(
1892  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1893  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1894  auto detected_timeout = watchdog_ir_builder.CreateCall(
1895  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1896  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1897  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1898  watchdog_ir_builder.CreateBr(error_check_bb);
1899 
1900  llvm::ReplaceInstWithInst(
1901  &watchdog_br_instr,
1902  llvm::BranchInst::Create(
1903  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1904  ir_builder.SetInsertPoint(&br_instr);
1905  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1906 
1907  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1908  unified_err_lv->addIncoming(err_lv, &*bb_it);
1909  err_lv = unified_err_lv;
1910  } else if (run_with_allowing_runtime_interrupt) {
1911  CHECK(pos);
1912  llvm::Value* call_check_interrupt_lv = nullptr;
1913  if (device_type == ExecutorDeviceType::GPU) {
1914  // approximate how many times the %pos variable
1915  // is incremented --> the number of iterations
1916  // here we calculate the bit-shift amount from the grid/block/fragment sizes,
1917  // since a fixed frequency (i.e., every 64th increment) can prevent some CUDA
1918  // threads from ever entering the interrupt checking block depending on
1919  // the fragment size --> if the outer table is not sufficiently large, a thread
1920  // may never reach a checked iteration and so cannot be interrupted
1921  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1922  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1923  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1924  uint64_t interrupt_checking_freq = 32;
1925  auto freq_control_knob = g_running_query_interrupt_freq;
1926  CHECK_GT(freq_control_knob, 0);
1927  CHECK_LE(freq_control_knob, 1.0);
1928  if (!input_table_infos.empty()) {
1929  const auto& outer_table_info = *input_table_infos.begin();
1930  auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
1931  if (outer_table_info.table_id < 0) {
1932  auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
1933  CHECK(rs);
1934  num_outer_table_tuples = rs->entryCount();
1935  } else {
1936  auto num_frags = outer_table_info.info.fragments.size();
1937  if (num_frags > 0) {
1938  num_outer_table_tuples =
1939  outer_table_info.info.fragments.begin()->getNumTuples();
1940  }
1941  }
1942  if (num_outer_table_tuples > 0) {
1943  // gridSize * blockSize --> pos_step (idx of the next row per thread)
1944  // we additionally multiply pos_step by two since the number of
1945  // dispatched blocks is double the gridSize
1946  // # tuples (of fragment) / pos_step --> maximum # increments (K)
1947  // we also multiply K by (1 - freq_control_knob) to control the frequency:
1948  // to check the interrupt status more often, make K smaller
1949  auto max_inc = uint64_t(
1950  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1951  if (max_inc < 2) {
1952  // `max_inc` is too small, so this correction is necessary to keep
1953  // `interrupt_checking_freq` valid (i.e., larger than zero)
1954  max_inc = 2;
1955  }
1956  auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
1957  interrupt_checking_freq =
1958  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
1959  // cover the case where interrupt_checking_freq > K:
1960  // if so, some threads could still never branch to the interrupt checker,
1961  // so we manually use a frequency smaller than (but close to) max_inc
1962  if (interrupt_checking_freq > max_inc) {
1963  interrupt_checking_freq = max_inc / 2;
1964  }
1965  if (interrupt_checking_freq < 8) {
1966  // such a small freq incurs too frequent interrupt status checking,
1967  // so we fix it up to a reasonable minimum frequency
1968  interrupt_checking_freq = 8;
1969  }
1970  }
1971  }
1972  VLOG(1) << "Set the running query interrupt checking frequency: "
1973  << interrupt_checking_freq;
1974  // check the interrupt flag for every interrupt_checking_freq-th iteration
1975  llvm::Value* pos_shifted_per_iteration =
1976  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1977  auto interrupt_predicate =
1978  ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
1979  call_check_interrupt_lv =
1980  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1981  interrupt_predicate,
1982  cgen_state_->llInt(int64_t(0LL)));
1983  } else {
1984  // CPU path: run interrupt checker for every 64th row
1985  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1986  call_check_interrupt_lv =
1987  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1988  interrupt_predicate,
1989  cgen_state_->llInt(int64_t(0LL)));
1990  }
1991  CHECK(call_check_interrupt_lv);
1992  auto error_check_bb = bb_it->splitBasicBlock(
1993  llvm::BasicBlock::iterator(br_instr), ".error_check");
1994  auto& check_interrupt_br_instr = bb_it->back();
1995 
1996  auto interrupt_check_bb = llvm::BasicBlock::Create(
1997  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1998  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1999  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2000  cgen_state_->module_->getFunction("check_interrupt"), {});
2001  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
2002  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
2003  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2004 
2005  llvm::ReplaceInstWithInst(
2006  &check_interrupt_br_instr,
2007  llvm::BranchInst::Create(
2008  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
2009  ir_builder.SetInsertPoint(&br_instr);
2010  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2011 
2012  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
2013  unified_err_lv->addIncoming(err_lv, &*bb_it);
2014  err_lv = unified_err_lv;
2015  }
2016  if (!err_lv_returned_from_row_func) {
2017  err_lv_returned_from_row_func = err_lv;
2018  }
2019  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
2020  // let kernel execution finish as expected, regardless of the observed error,
2021  // unless it is from the dynamic watchdog where all threads within that block
2022  // return together.
2023  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2024  err_lv,
2026  } else {
2027  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
2028  err_lv,
2029  cgen_state_->llInt(static_cast<int32_t>(0)));
2030  }
2031  auto error_bb = llvm::BasicBlock::Create(
2032  cgen_state_->context_, ".error_exit", query_func, new_bb);
2033  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
2034  llvm::CallInst::Create(
2035  cgen_state_->module_->getFunction("record_error_code"),
2036  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
2037  "",
2038  error_bb);
2039  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2040  llvm::ReplaceInstWithInst(&br_instr,
2041  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2042  done_splitting = true;
2043  break;
2044  }
2045  }
2046  }
2047  CHECK(done_splitting);
2048 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1044
double g_running_query_interrupt_freq
Definition: Execute.cpp:113
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1076
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:985
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:77
#define CHECK_GT(x, y)
Definition: Logger.h:209
unsigned getExpOfTwo(unsigned n)
Definition: MathUtils.cpp:23
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:162
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1075
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static QuerySessionId current_query_session_
Definition: Execute.h:1046
#define CHECK_LE(x, y)
Definition: Logger.h:208
unsigned gridSize() const
Definition: Execute.cpp:3156
#define CHECK(condition)
Definition: Logger.h:197
unsigned blockSize() const
Definition: Execute.cpp:3173
#define VLOG(n)
Definition: Logger.h:291
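
The GPU branch above calibrates how often a thread checks the interrupt flag. The same arithmetic, extracted from the IR-building code into a standalone form (all names are illustrative; the constants mirror the listing):

  #include <cstdint>

  inline unsigned exp_of_two(uint64_t n) {  // floor(log2(n)) for n > 0, 0 otherwise
    unsigned e = 0;
    while (n >>= 1) {
      ++e;
    }
    return e;
  }

  uint64_t interrupt_checking_freq(uint64_t num_outer_table_tuples,
                                   unsigned grid_size,
                                   unsigned block_size,
                                   double freq_control_knob /* in (0, 1] */) {
    uint64_t freq = 32;  // default when the outer table is empty
    if (num_outer_table_tuples > 0) {
      // pos advances by gridSize * blockSize per iteration, and twice that many
      // blocks are dispatched, hence the factor of two
      uint64_t max_inc =
          num_outer_table_tuples / (uint64_t(grid_size) * block_size * 2);
      if (max_inc < 2) {
        max_inc = 2;  // keep the calibrated frequency positive
      }
      const uint64_t calibrated_inc =
          static_cast<uint64_t>(max_inc * (1 - freq_control_knob));
      freq = uint64_t(1) << exp_of_two(calibrated_inc);
      if (freq > max_inc) {
        freq = max_inc / 2;  // ensure every thread can still reach a checked iteration
      }
      if (freq < 8) {
        freq = 8;  // avoid checking the flag too frequently
      }
    }
    return freq;
  }

A knob close to 1.0 drives the calibrated increment toward zero, so the frequency clamps to the minimum of 8 (checks as often as allowed); a smaller knob yields less frequent checking, as described in the listing's comments.
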
std::vector< std::unique_ptr< ExecutionKernel > > Executor::createKernels ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
ColumnFetcher column_fetcher,
const std::vector< InputTableInfo > &  table_infos,
const ExecutionOptions eo,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const size_t  context_count,
const QueryCompilationDescriptor query_comp_desc,
const QueryMemoryDescriptor query_mem_desc,
RenderInfo render_info,
std::unordered_set< int > &  available_gpus,
int &  available_cpus 
)
private

Determines execution dispatch mode and required fragments for a given query step, then creates kernels to execute the query and returns them for launch.

Definition at line 1988 of file Execute.cpp.

References ExecutionOptions::allow_multifrag, catalog_(), CHECK, CHECK_GE, CHECK_GT, anonymous_namespace{Execute.cpp}::checkWorkUnitWatchdog(), g_inner_join_fragment_skipping, QueryCompilationDescriptor::getDeviceType(), QueryMemoryDescriptor::getEntryCount(), SharedKernelContext::getFragOffsets(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, ExecutionOptions::gpu_input_mem_limit_percent, Data_Namespace::GPU_LEVEL, anonymous_namespace{Execute.cpp}::has_lazy_fetched_columns(), logger::INFO, RelAlgExecutionUnit::input_descs, KernelPerFragment, LOG, MultifragmentKernel, ExecutionOptions::outer_fragment_indices, Projection, query_mem_desc, RelAlgExecutionUnit::target_exprs, QueryMemoryDescriptor::toString(), RelAlgExecutionUnit::use_bump_allocator, VLOG, and ExecutionOptions::with_watchdog.

2001  {
2002  std::vector<std::unique_ptr<ExecutionKernel>> execution_kernels;
2003 
2004  QueryFragmentDescriptor fragment_descriptor(
2005  ra_exe_unit,
2006  table_infos,
2007  query_comp_desc.getDeviceType() == ExecutorDeviceType::GPU
2009  : std::vector<Data_Namespace::MemoryInfo>{},
2012  CHECK(!ra_exe_unit.input_descs.empty());
2013 
2014  const auto device_type = query_comp_desc.getDeviceType();
2015  const bool uses_lazy_fetch =
2016  plan_state_->allow_lazy_fetch_ &&
2018  const bool use_multifrag_kernel = (device_type == ExecutorDeviceType::GPU) &&
2019  eo.allow_multifrag && (!uses_lazy_fetch || is_agg);
2020  const auto device_count = deviceCount(device_type);
2021  CHECK_GT(device_count, 0);
2022 
2023  fragment_descriptor.buildFragmentKernelMap(ra_exe_unit,
2024  shared_context.getFragOffsets(),
2025  device_count,
2026  device_type,
2027  use_multifrag_kernel,
2029  this);
2030  if (eo.with_watchdog && fragment_descriptor.shouldCheckWorkUnitWatchdog()) {
2031  checkWorkUnitWatchdog(ra_exe_unit, table_infos, *catalog_, device_type, device_count);
2032  }
2033 
2034  if (use_multifrag_kernel) {
2035  VLOG(1) << "Creating multifrag execution kernels";
2036  VLOG(1) << query_mem_desc.toString();
2037 
2038  // NB: We should never be on this path when the query is retried because of running
2039  // out of group by slots; also, for scan only queries on CPU we want the
2040  // high-granularity, fragment by fragment execution instead. For scan only queries on
2041  // GPU, we want the multifrag kernel path to save the overhead of allocating an output
2042  // buffer per fragment.
2043  auto multifrag_kernel_dispatch = [&ra_exe_unit,
2044  &execution_kernels,
2045  &column_fetcher,
2046  &eo,
2047  &query_comp_desc,
2048  &query_mem_desc,
2049  render_info](const int device_id,
2050  const FragmentsList& frag_list,
2051  const int64_t rowid_lookup_key) {
2052  execution_kernels.emplace_back(
2053  std::make_unique<ExecutionKernel>(ra_exe_unit,
2055  device_id,
2056  eo,
2057  column_fetcher,
2058  query_comp_desc,
2059  query_mem_desc,
2060  frag_list,
2062  render_info,
2063  rowid_lookup_key));
2064  };
2065  fragment_descriptor.assignFragsToMultiDispatch(multifrag_kernel_dispatch);
2066  } else {
2067  VLOG(1) << "Creating one execution kernel per fragment";
2068  VLOG(1) << query_mem_desc.toString();
2069 
2070  if (!ra_exe_unit.use_bump_allocator && allow_single_frag_table_opt &&
2071  (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) &&
2072  table_infos.size() == 1 && table_infos.front().table_id > 0) {
2073  const auto max_frag_size =
2074  table_infos.front().info.getFragmentNumTuplesUpperBound();
2075  if (max_frag_size < query_mem_desc.getEntryCount()) {
2076  LOG(INFO) << "Lowering scan limit from " << query_mem_desc.getEntryCount()
2077  << " to match max fragment size " << max_frag_size
2078  << " for kernel per fragment execution path.";
2079  throw CompilationRetryNewScanLimit(max_frag_size);
2080  }
2081  }
2082 
2083  size_t frag_list_idx{0};
2084  auto fragment_per_kernel_dispatch = [&ra_exe_unit,
2085  &execution_kernels,
2086  &column_fetcher,
2087  &eo,
2088  &frag_list_idx,
2089  &device_type,
2090  &query_comp_desc,
2091  &query_mem_desc,
2092  render_info](const int device_id,
2093  const FragmentsList& frag_list,
2094  const int64_t rowid_lookup_key) {
2095  if (!frag_list.size()) {
2096  return;
2097  }
2098  CHECK_GE(device_id, 0);
2099 
2100  execution_kernels.emplace_back(
2101  std::make_unique<ExecutionKernel>(ra_exe_unit,
2102  device_type,
2103  device_id,
2104  eo,
2105  column_fetcher,
2106  query_comp_desc,
2107  query_mem_desc,
2108  frag_list,
2110  render_info,
2111  rowid_lookup_key));
2112  ++frag_list_idx;
2113  };
2114 
2115  fragment_descriptor.assignFragsToKernelDispatch(fragment_per_kernel_dispatch,
2116  ra_exe_unit);
2117  }
2118 
2119  return execution_kernels;
2120 }
bool is_agg(const Analyzer::Expr *expr)
std::vector< Analyzer::Expr * > target_exprs
ExecutorDeviceType getDeviceType() const
const std::vector< uint64_t > & getFragOffsets()
std::string toString() const
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:222
#define LOG(tag)
Definition: Logger.h:188
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo(const std::vector< Analyzer::Expr * > &target_exprs) const
Definition: Execute.cpp:332
std::vector< InputDescriptor > input_descs
#define CHECK_GE(x, y)
Definition: Logger.h:210
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:641
#define CHECK_GT(x, y)
Definition: Logger.h:209
std::vector< FragmentsPerTable > FragmentsList
bool g_inner_join_fragment_skipping
Definition: Execute.cpp:85
const bool allow_multifrag
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
const double gpu_input_mem_limit_percent
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1000
void checkWorkUnitWatchdog(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const Catalog_Namespace::Catalog &cat, const ExecutorDeviceType device_type, const int device_count)
Definition: Execute.cpp:1104
std::vector< MemoryInfo > getMemoryInfo(const MemoryLevel memLevel)
Definition: DataMgr.cpp:304
const std::vector< size_t > outer_fragment_indices
#define CHECK(condition)
Definition: Logger.h:197
bool has_lazy_fetched_columns(const std::vector< ColumnLazyFetchInfo > &fetched_cols)
Definition: Execute.cpp:1977
#define VLOG(n)
Definition: Logger.h:291
const bool with_watchdog

+ Here is the call graph for this function:
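
The key decision above is whether to build one kernel spanning multiple fragments or one kernel per fragment. Condensed into a standalone predicate (plain bools standing in for the execution options and plan state; names are illustrative):

  bool use_multifrag_kernel(bool is_gpu,
                            bool allow_multifrag,
                            bool uses_lazy_fetch,
                            bool is_agg) {
    // Multi-fragment kernels are only used on GPU, when the execution options allow
    // them, and when lazy fetch is either not used or the query is an aggregate.
    return is_gpu && allow_multifrag && (!uses_lazy_fetch || is_agg);
  }

In the per-fragment path, a projection over a single table can additionally throw CompilationRetryNewScanLimit to lower the scan limit to the largest fragment size, as in the listing.
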

int Executor::deviceCount ( const ExecutorDeviceType  device_type) const
private

Definition at line 641 of file Execute.cpp.

References catalog_(), CHECK, and GPU.

641  {
642  if (device_type == ExecutorDeviceType::GPU) {
643  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
644  CHECK(cuda_mgr);
645  return cuda_mgr->getDeviceCount();
646  } else {
647  return 1;
648  }
649 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:206
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:222
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

int Executor::deviceCountForMemoryLevel ( const Data_Namespace::MemoryLevel  memory_level) const
private

Definition at line 651 of file Execute.cpp.

References CPU, GPU, and Data_Namespace::GPU_LEVEL.

652  {
653  return memory_level == GPU_LEVEL ? deviceCount(ExecutorDeviceType::GPU)
654  : deviceCount(ExecutorDeviceType::CPU);
655 }
ExecutorDeviceType
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:641
int64_t Executor::deviceCycles ( int  milliseconds) const
private

Definition at line 3187 of file Execute.cpp.

References catalog_(), and CHECK.

3187  {
3188  CHECK(catalog_);
3189  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
3190  CHECK(cuda_mgr);
3191  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
3192  return static_cast<int64_t>(dev_props.front().clockKhz) * milliseconds;
3193 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:206
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:222
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1029
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:
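
The clock rate reported for the device is in kHz, i.e. cycles per millisecond, so the conversion above is a single multiplication. As a standalone sketch (illustrative helper, not part of the Executor API):

  #include <cstdint>

  int64_t device_cycles(int clock_khz, int milliseconds) {
    return static_cast<int64_t>(clock_khz) * milliseconds;
  }

For example, a 1,410,000 kHz (1.41 GHz) device over 100 ms gives roughly 141,000,000 cycles.
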

void Executor::enableRuntimeQueryInterrupt ( const double  runtime_query_check_freq,
const unsigned  pending_query_check_freq 
) const

Definition at line 3895 of file Execute.cpp.

References g_enable_runtime_query_interrupt, g_pending_query_interrupt_freq, and g_running_query_interrupt_freq.

3897  {
3898  // The only scenario in which we intentionally call this function is
3899  // to allow runtime query interrupt in QueryRunner for test cases.
3900  // Because the test machine's default settings do not allow runtime query
3901  // interrupt, we have to turn it on within test code if necessary.
3903  g_pending_query_interrupt_freq = pending_query_check_freq;
3904  g_running_query_interrupt_freq = runtime_query_check_freq;
3907  }
3908 }
double g_running_query_interrupt_freq
Definition: Execute.cpp:113
unsigned g_pending_query_interrupt_freq
Definition: Execute.cpp:112
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:110
void Executor::enrollQuerySession ( const QuerySessionId query_session,
const std::string &  query_str,
const std::chrono::time_point< std::chrono::system_clock >  submitted,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_session_status 
)

Definition at line 3730 of file Execute.cpp.

3735  {
3736  // enroll the query session into the Executor's session map
3737  mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
3738  if (query_session.empty()) {
3739  return;
3740  }
3741  addToQuerySessionList(query_session,
3742  query_str,
3743  submitted,
3744  executor_id,
3745  query_session_status,
3746  session_write_lock);
3747 }
static mapd_shared_mutex executor_session_mutex_
Definition: Execute.h:1044
bool addToQuerySessionList(const QuerySessionId &query_session, const std::string &query_str, const std::chrono::time_point< std::chrono::system_clock > submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, mapd_unique_lock< mapd_shared_mutex > &write_lock)
Definition: Execute.cpp:3749
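
Session bookkeeping follows the usual reader/writer locking discipline: mutations such as enrollment take a unique (exclusive) lock on executor_session_mutex_, while lookups elsewhere take a shared lock. A generic, self-contained sketch of that pattern with std::shared_mutex standing in for mapd_shared_mutex (types and names are illustrative):

  #include <map>
  #include <mutex>
  #include <shared_mutex>
  #include <string>

  class SessionRegistry {
   public:
    void enroll(const std::string& session, const std::string& query_str) {
      std::unique_lock<std::shared_mutex> write_lock(mutex_);  // writers are exclusive
      if (session.empty()) {
        return;
      }
      sessions_[session] = query_str;
    }

    bool contains(const std::string& session) const {
      std::shared_lock<std::shared_mutex> read_lock(mutex_);  // readers may share
      return sessions_.count(session) > 0;
    }

   private:
    mutable std::shared_mutex mutex_;
    std::map<std::string, std::string> sessions_;
  };
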
ResultSetPtr Executor::executeExplain ( const QueryCompilationDescriptor query_comp_desc)
private

Definition at line 1661 of file Execute.cpp.

References QueryCompilationDescriptor::getIR().

1661  {
1662  return std::make_shared<ResultSet>(query_comp_desc.getIR());
1663 }

+ Here is the call graph for this function:

int32_t Executor::executePlanWithGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< size_t >  outer_tab_frag_ids,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const int  outer_table_id,
const int64_t  limit,
const uint32_t  start_rowid,
const uint32_t  num_tables,
RenderInfo render_info 
)
private

Definition at line 2897 of file Execute.cpp.

References CHECK, CHECK_NE, anonymous_namespace{Execute.cpp}::check_rows_less_than_needed(), CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, logger::FATAL, g_enable_dynamic_watchdog, g_enable_runtime_query_interrupt, CompilationResult::generated_code, QueryMemoryDescriptor::getEntryCount(), QueryExecutionContext::getRowSet(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, RelAlgExecutionUnit::groupby_exprs, INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, shared::printContainer(), QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, RenderInfo::render_allocator_map_ptr, RelAlgExecutionUnit::scan_limit, QueryMemoryDescriptor::setEntryCount(), RelAlgExecutionUnit::union_all, RenderInfo::useCudaBuffers(), and VLOG.

2914  {
2915  auto timer = DEBUG_TIMER(__func__);
2917  CHECK(!results);
2918  if (col_buffers.empty()) {
2919  return 0;
2920  }
2921  CHECK_NE(ra_exe_unit.groupby_exprs.size(), size_t(0));
2922  // TODO(alex):
2923  // 1. Optimize size (make keys more compact).
2924  // 2. Resize on overflow.
2925  // 3. Optimize runtime.
2926  auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
2927  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
2928  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
2930  interrupted_.load()) {
2931  return ERR_INTERRUPTED;
2932  }
2933 
2934  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
2935  if (render_info && render_info->useCudaBuffers()) {
2936  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
2937  }
2938 
2939  VLOG(2) << "bool(ra_exe_unit.union_all)=" << bool(ra_exe_unit.union_all)
2940  << " ra_exe_unit.input_descs="
2941  << shared::printContainer(ra_exe_unit.input_descs)
2942  << " ra_exe_unit.input_col_descs="
2943  << shared::printContainer(ra_exe_unit.input_col_descs)
2944  << " ra_exe_unit.scan_limit=" << ra_exe_unit.scan_limit
2945  << " num_rows=" << shared::printContainer(num_rows)
2946  << " frag_offsets=" << shared::printContainer(frag_offsets)
2947  << " query_exe_context->query_buffers_->num_rows_="
2948  << query_exe_context->query_buffers_->num_rows_
2949  << " query_exe_context->query_mem_desc_.getEntryCount()="
2950  << query_exe_context->query_mem_desc_.getEntryCount()
2951  << " device_id=" << device_id << " outer_table_id=" << outer_table_id
2952  << " scan_limit=" << scan_limit << " start_rowid=" << start_rowid
2953  << " num_tables=" << num_tables;
2954 
2955  RelAlgExecutionUnit ra_exe_unit_copy = ra_exe_unit;
2956  // For UNION ALL, filter out input_descs and input_col_descs that are not associated
2957  // with outer_table_id.
2958  if (ra_exe_unit_copy.union_all) {
2959  // Sort outer_table_id first, then pop the rest off of ra_exe_unit_copy.input_descs.
2960  std::stable_sort(ra_exe_unit_copy.input_descs.begin(),
2961  ra_exe_unit_copy.input_descs.end(),
2962  [outer_table_id](auto const& a, auto const& b) {
2963  return a.getTableId() == outer_table_id &&
2964  b.getTableId() != outer_table_id;
2965  });
2966  while (!ra_exe_unit_copy.input_descs.empty() &&
2967  ra_exe_unit_copy.input_descs.back().getTableId() != outer_table_id) {
2968  ra_exe_unit_copy.input_descs.pop_back();
2969  }
2970  // Filter ra_exe_unit_copy.input_col_descs.
2971  ra_exe_unit_copy.input_col_descs.remove_if(
2972  [outer_table_id](auto const& input_col_desc) {
2973  return input_col_desc->getScanDesc().getTableId() != outer_table_id;
2974  });
2975  query_exe_context->query_mem_desc_.setEntryCount(ra_exe_unit_copy.scan_limit);
2976  }
2977 
2978  if (device_type == ExecutorDeviceType::CPU) {
2979  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
2980  compilation_result.generated_code);
2981  CHECK(cpu_generated_code);
2982  query_exe_context->launchCpuCode(
2983  ra_exe_unit_copy,
2984  cpu_generated_code.get(),
2985  hoist_literals,
2986  hoist_buf,
2987  col_buffers,
2988  num_rows,
2989  frag_offsets,
2990  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
2991  &error_code,
2992  num_tables,
2993  join_hash_table_ptrs);
2994  } else {
2995  try {
2996  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
2997  compilation_result.generated_code);
2998  CHECK(gpu_generated_code);
2999  query_exe_context->launchGpuCode(
3000  ra_exe_unit_copy,
3001  gpu_generated_code.get(),
3002  hoist_literals,
3003  hoist_buf,
3004  col_buffers,
3005  num_rows,
3006  frag_offsets,
3007  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
3008  data_mgr,
3009  blockSize(),
3010  gridSize(),
3011  device_id,
3012  compilation_result.gpu_smem_context.getSharedMemorySize(),
3013  &error_code,
3014  num_tables,
3015  join_hash_table_ptrs,
3016  render_allocator_map_ptr);
3017  } catch (const OutOfMemory&) {
3018  return ERR_OUT_OF_GPU_MEM;
3019  } catch (const OutOfRenderMemory&) {
3020  return ERR_OUT_OF_RENDER_MEM;
3021  } catch (const StreamingTopNNotSupportedInRenderQuery&) {
3023  } catch (const std::exception& e) {
3024  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
3025  }
3026  }
3027 
3028  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
3029  error_code == Executor::ERR_DIV_BY_ZERO ||
3030  error_code == Executor::ERR_OUT_OF_TIME ||
3031  error_code == Executor::ERR_INTERRUPTED ||
3033  error_code == Executor::ERR_GEOS) {
3034  return error_code;
3035  }
3036 
3037  if (error_code != Executor::ERR_OVERFLOW_OR_UNDERFLOW &&
3038  error_code != Executor::ERR_DIV_BY_ZERO && !render_allocator_map_ptr) {
3039  results = query_exe_context->getRowSet(ra_exe_unit_copy,
3040  query_exe_context->query_mem_desc_);
3041  CHECK(results);
3042  VLOG(2) << "results->rowCount()=" << results->rowCount();
3043  results->holdLiterals(hoist_buf);
3044  }
3045  if (error_code < 0 && render_allocator_map_ptr) {
3046  auto const adjusted_scan_limit =
3047  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
3048  // More rows passed the filter than available slots. We don't have a count to check,
3049  // so assume we met the limit if a scan limit is set
3050  if (adjusted_scan_limit != 0) {
3051  return 0;
3052  } else {
3053  return error_code;
3054  }
3055  }
3056  if (error_code && (!scan_limit || check_rows_less_than_needed(results, scan_limit))) {
3057  return error_code; // unlucky, not enough results and we ran out of slots
3058  }
3059 
3060  return 0;
3061 }
bool useCudaBuffers() const
Definition: RenderInfo.cpp:69
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1076
void setEntryCount(const size_t val)
GpuSharedMemoryContext gpu_smem_context
const std::optional< bool > union_all
#define LOG(tag)
Definition: Logger.h:188
size_t getSharedMemorySize() const
std::vector< InputDescriptor > input_descs
static const int32_t ERR_GEOS
Definition: Execute.h:1082
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:77
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
std::vector< int64_t * > launchGpuCode(const RelAlgExecutionUnit &ra_exe_unit, const GpuCompilationContext *cu_functions, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, Data_Namespace::DataMgr *data_mgr, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const size_t shared_memory_size, int32_t *error_code, const uint32_t num_tables, const std::vector< int64_t > &join_hash_tables, RenderAllocatorMap *render_allocator_map)
std::unique_ptr< QueryMemoryInitializer > query_buffers_
int32_t executePlanWithGroupBy(const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr &results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const int outer_table_id, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, RenderInfo *render_info)
Definition: Execute.cpp:2897
std::vector< int64_t > getJoinHashTablePtrs(const ExecutorDeviceType device_type, const int device_id)
Definition: Execute.cpp:3063
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY
Definition: Execute.h:1080
static const int32_t ERR_DIV_BY_ZERO
Definition: Execute.h:1068
ResultSetPtr getRowSet(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc) const
#define INJECT_TIMER(DESC)
Definition: measure.h:93
#define CHECK_NE(x, y)
Definition: Logger.h:206
static const int32_t ERR_OUT_OF_RENDER_MEM
Definition: Execute.h:1072
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW
Definition: Execute.h:1074
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1075
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
Definition: Execute.h:1081
static const int32_t ERR_OUT_OF_GPU_MEM
Definition: Execute.h:1069
std::shared_ptr< CompilationContext > generated_code
QueryMemoryDescriptor query_mem_desc_
std::vector< int8_t > serializeLiterals(const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
Definition: Execute.cpp:379
unsigned gridSize() const
Definition: Execute.cpp:3156
std::unordered_map< int, CgenState::LiteralValues > literal_values
std::unique_ptr< RenderAllocatorMap > render_allocator_map_ptr
Definition: RenderInfo.h:32
bool check_rows_less_than_needed(const ResultSetPtr &results, const size_t scan_limit)
Definition: Execute.cpp:2890
static std::atomic< bool > interrupted_
Definition: Execute.h:1009
#define CHECK(condition)
Definition: Logger.h:197
#define DEBUG_TIMER(name)
Definition: Logger.h:313
std::vector< int64_t * > launchCpuCode(const RelAlgExecutionUnit &ra_exe_unit, const CpuCompilationContext *fn_ptrs, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, int32_t *error_code, const uint32_t num_tables, const std::vector< int64_t > &join_hash_tables)
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:64
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
unsigned blockSize() const
Definition: Execute.cpp:3173
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:110
#define VLOG(n)
Definition: Logger.h:291

+ Here is the call graph for this function:
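
The UNION ALL branch above prunes the execution unit down to the current outer table: input descriptors for that table are stably moved to the front, everything after them is popped, and column descriptors for other tables are removed. The same pattern on plain containers (ints standing in for InputDescriptor / InputColDescriptor; names are illustrative):

  #include <algorithm>
  #include <list>
  #include <vector>

  void prune_to_outer_table(std::vector<int>& input_table_ids,
                            std::list<int>& input_col_table_ids,
                            const int outer_table_id) {
    // Stable-sort so descriptors for outer_table_id come first, preserving order.
    std::stable_sort(input_table_ids.begin(),
                     input_table_ids.end(),
                     [outer_table_id](int a, int b) {
                       return a == outer_table_id && b != outer_table_id;
                     });
    // Drop trailing descriptors that belong to other tables.
    while (!input_table_ids.empty() && input_table_ids.back() != outer_table_id) {
      input_table_ids.pop_back();
    }
    // Column descriptors for other tables are removed outright.
    input_col_table_ids.remove_if(
        [outer_table_id](int table_id) { return table_id != outer_table_id; });
  }
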

int32_t Executor::executePlanWithoutGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const std::vector< Analyzer::Expr * > &  target_exprs,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t * >> &  col_buffers,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const uint32_t  start_rowid,
const uint32_t  num_tables,
RenderInfo render_info 
)
private

Definition at line 2697 of file Execute.cpp.

References CHECK, CHECK_EQ, CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, RelAlgExecutionUnit::estimator, QueryExecutionContext::estimator_result_set_, logger::FATAL, g_bigint_count, g_enable_dynamic_watchdog, g_enable_runtime_query_interrupt, CompilationResult::generated_code, get_target_info(), QueryExecutionContext::getAggInitValForIndex(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, INJECT_TIMER, is_distinct_target(), RenderInfo::isPotentialInSituRender(), GpuSharedMemoryContext::isSharedMemoryUsed(), kAPPROX_COUNT_DISTINCT, kAVG, kCOUNT, kSAMPLE, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, reduceResults(), RenderInfo::render_allocator_map_ptr, takes_float_argument(), and RenderInfo::useCudaBuffers().

2712  {
2714  auto timer = DEBUG_TIMER(__func__);
2715  CHECK(!results);
2716  if (col_buffers.empty()) {
2717  return 0;
2718  }
2719 
2720  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
2721  if (render_info) {
2722  // TODO(adb): make sure that we either never get here in the CPU case, or if we do get
2723  // here, we are in non-insitu mode.
2724  CHECK(render_info->useCudaBuffers() || !render_info->isPotentialInSituRender())
2725  << "CUDA disabled rendering in the executePlanWithoutGroupBy query path is "
2726  "currently unsupported.";
2727  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
2728  }
2729 
2730  int32_t error_code = device_type == ExecutorDeviceType::GPU ? 0 : start_rowid;
2731  std::vector<int64_t*> out_vec;
2732  const auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
2733  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
2734  std::unique_ptr<OutVecOwner> output_memory_scope;
2736  interrupted_.load()) {
2737  resetInterrupt();
2739  }
2740  if (device_type == ExecutorDeviceType::CPU) {
2741  auto cpu_generated_code = std::dynamic_pointer_cast<CpuCompilationContext>(
2742  compilation_result.generated_code);
2743  CHECK(cpu_generated_code);
2744  out_vec = query_exe_context->launchCpuCode(ra_exe_unit,
2745  cpu_generated_code.get(),
2746  hoist_literals,
2747  hoist_buf,
2748  col_buffers,
2749  num_rows,
2750  frag_offsets,
2751  0,
2752  &error_code,
2753  num_tables,
2754  join_hash_table_ptrs);
2755  output_memory_scope.reset(new OutVecOwner(out_vec));
2756  } else {
2757  auto gpu_generated_code = std::dynamic_pointer_cast<GpuCompilationContext>(
2758  compilation_result.generated_code);
2759  CHECK(gpu_generated_code);
2760  try {
2761  out_vec = query_exe_context->launchGpuCode(
2762  ra_exe_unit,
2763  gpu_generated_code.get(),
2764  hoist_literals,
2765  hoist_buf,
2766  col_buffers,
2767  num_rows,
2768  frag_offsets,
2769  0,
2770  data_mgr,
2771  blockSize(),
2772  gridSize(),
2773  device_id,
2774  compilation_result.gpu_smem_context.getSharedMemorySize(),
2775  &error_code,
2776  num_tables,
2777  join_hash_table_ptrs,
2778  render_allocator_map_ptr);
2779  output_memory_scope.reset(new OutVecOwner(out_vec));
2780  } catch (const OutOfMemory&) {
2781  return ERR_OUT_OF_GPU_MEM;
2782  } catch (const std::exception& e) {
2783  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
2784  }
2785  }
2786  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
2787  error_code == Executor::ERR_DIV_BY_ZERO ||
2788  error_code == Executor::ERR_OUT_OF_TIME ||
2789  error_code == Executor::ERR_INTERRUPTED ||
2791  error_code == Executor::ERR_GEOS) {
2792  return error_code;
2793  }
2794  if (ra_exe_unit.estimator) {
2795  CHECK(!error_code);
2796  results =
2797  std::shared_ptr<ResultSet>(query_exe_context->estimator_result_set_.release());
2798  return 0;
2799  }
2800  std::vector<int64_t> reduced_outs;
2801  const auto num_frags = col_buffers.size();
2802  const size_t entry_count =
2803  device_type == ExecutorDeviceType::GPU
2804  ? (compilation_result.gpu_smem_context.isSharedMemoryUsed()
2805  ? 1
2806  : blockSize() * gridSize() * num_frags)
2807  : num_frags;
2808  if (size_t(1) == entry_count) {
2809  for (auto out : out_vec) {
2810  CHECK(out);
2811  reduced_outs.push_back(*out);
2812  }
2813  } else {
2814  size_t out_vec_idx = 0;
2815 
2816  for (const auto target_expr : target_exprs) {
2817  const auto agg_info = get_target_info(target_expr, g_bigint_count);
2818  CHECK(agg_info.is_agg);
2819 
2820  const int num_iterations = agg_info.sql_type.is_geometry()
2821  ? agg_info.sql_type.get_physical_coord_cols()
2822  : 1;
2823 
2824  for (int i = 0; i < num_iterations; i++) {
2825  int64_t val1;
2826  const bool float_argument_input = takes_float_argument(agg_info);
2827  if (is_distinct_target(agg_info)) {
2828  CHECK(agg_info.agg_kind == kCOUNT ||
2829  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
2830  val1 = out_vec[out_vec_idx][0];
2831  error_code = 0;
2832  } else {
2833  const auto chosen_bytes = static_cast<size_t>(
2834  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx));
2835  std::tie(val1, error_code) = Executor::reduceResults(
2836  agg_info.agg_kind,
2837  agg_info.sql_type,
2838  query_exe_context->getAggInitValForIndex(out_vec_idx),
2839  float_argument_input ? sizeof(int32_t) : chosen_bytes,
2840  out_vec[out_vec_idx],
2841  entry_count,
2842  false,
2843  float_argument_input);
2844  }
2845  if (error_code) {
2846  break;
2847  }
2848  reduced_outs.push_back(val1);
2849  if (agg_info.agg_kind == kAVG ||
2850  (agg_info.agg_kind == kSAMPLE &&
2851  (agg_info.sql_type.is_varlen() || agg_info.sql_type.is_geometry()))) {
2852  const auto chosen_bytes = static_cast<size_t>(
2853  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx +
2854  1));
2855  int64_t val2;
2856  std::tie(val2, error_code) = Executor::reduceResults(
2857  agg_info.agg_kind == kAVG ? kCOUNT : agg_info.agg_kind,
2858  agg_info.sql_type,
2859  query_exe_context->getAggInitValForIndex(out_vec_idx + 1),
2860  float_argument_input ? sizeof(int32_t) : chosen_bytes,
2861  out_vec[out_vec_idx + 1],
2862  entry_count,
2863  false,
2864  false);
2865  if (error_code) {
2866  break;
2867  }
2868  reduced_outs.push_back(val2);
2869  ++out_vec_idx;
2870  }
2871  ++out_vec_idx;
2872  }
2873  }
2874  }
2875 
2876  if (error_code) {
2877  return error_code;
2878  }
2879 
2880  CHECK_EQ(size_t(1), query_exe_context->query_buffers_->result_sets_.size());
2881  auto rows_ptr = std::shared_ptr<ResultSet>(
2882  query_exe_context->query_buffers_->result_sets_[0].release());
2883  rows_ptr->fillOneEntry(reduced_outs);
2884  results = std::move(rows_ptr);
2885  return error_code;
2886 }
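The tail of this listing shows how the per-entry partial aggregates are folded down: on GPU without shared memory each target slot holds blockSize() * gridSize() * num_frags partial values, on CPU one per fragment, and reduceResults collapses them to a single value per target, with kAVG (and varlen/geometry kSAMPLE) targets consuming a second slot for the count. The following is a minimal, illustrative sketch of that fold for an AVG target only; it is not Executor::reduceResults, which additionally handles aggregate init values, float-encoded slots, and overflow reporting.

// Illustrative only: fold per-entry partial aggregates for an AVG target.
#include <cstdint>
#include <iostream>
#include <vector>

// Reduce one output slot (one partial value per entry) by addition,
// which covers both SUM and COUNT partials.
int64_t reduce_slot(const std::vector<int64_t>& slot) {
  int64_t acc = 0;
  for (const auto v : slot) {
    acc += v;
  }
  return acc;
}

int main() {
  // Two entries (e.g. one per fragment): slot 0 holds partial sums,
  // slot 1 holds partial counts, as an AVG target occupies two slots.
  const std::vector<int64_t> partial_sums{10, 32};
  const std::vector<int64_t> partial_counts{2, 6};
  const int64_t sum = reduce_slot(partial_sums);
  const int64_t cnt = reduce_slot(partial_counts);
  std::cout << "AVG = " << static_cast<double>(sum) / cnt << "\n";  // 5.25
  return 0;
}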

+ Here is the call graph for this function:

ResultSetPtr Executor::executeTableFunction ( const TableFunctionExecutionUnit  exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const CompilationOptions co,
const ExecutionOptions eo,
const Catalog_Namespace::Catalog cat 
)
private

Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps.

Definition at line 1640 of file Execute.cpp.

References TableFunctionCompilationContext::compile(), CompilationOptions::device_type, TableFunctionExecutionContext::execute(), and INJECT_TIMER.

1645  {
1646  INJECT_TIMER(Exec_executeTableFunction);
1647  nukeOldState(false, table_infos, PlanState::DeletedColumnsMap{}, nullptr);
1648 
1649  ColumnCacheMap column_cache; // Note: if we add retries to the table function
1650  // framework, we may want to move this up a level
1651 
1652  ColumnFetcher column_fetcher(this, column_cache);
1653  TableFunctionCompilationContext compilation_context;
1654  compilation_context.compile(exe_unit, co, this);
1655 
1656  TableFunctionExecutionContext exe_context(getRowSetMemoryOwner());
1657  return exe_context.execute(
1658  exe_unit, table_infos, &compilation_context, column_fetcher, co.device_type, this);
1659 }
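The listing follows a compile-then-execute pattern: TableFunctionCompilationContext::compile lowers the table function for the chosen device, and the execution context runs it over the fetched input columns. Conceptually, a table function maps one or more input columns to an output column set; the toy, framework-free C++ analogue below only illustrates that column-in, column-out contract. The function name and shape (repeat_rows) are hypothetical and unrelated to the actual OmniSci table-function API.

// Conceptual sketch only: a "table function" over plain vectors.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Emit each input row `multiplier` times; return the output row count.
std::size_t repeat_rows(const std::vector<int64_t>& input_col,
                        const int multiplier,
                        std::vector<int64_t>& output_col) {
  output_col.clear();
  for (const auto v : input_col) {
    for (int i = 0; i < multiplier; ++i) {
      output_col.push_back(v);
    }
  }
  return output_col.size();
}

int main() {
  const std::vector<int64_t> in{1, 2, 3};
  std::vector<int64_t> out;
  std::cout << repeat_rows(in, 2, out) << " output rows\n";  // 6 output rows
  return 0;
}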

+ Here is the call graph for this function:

void Executor::executeUpdate ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const CompilationOptions co,
const ExecutionOptions eo,
const Catalog_Namespace::Catalog cat,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const UpdateLogForFragment::Callback cb,
const bool  is_agg 
)

Definition at line 62 of file ExecuteUpdate.cpp.

References CHECK, CHECK_EQ, CHECK_GT, CPU, executor_id_(), FragmentsPerTable::fragment_ids, SharedKernelContext::getFragmentResults(), KernelPerFragment, query_mem_desc, ExecutionKernel::run(), timer_start(), timer_stop(), and VLOG.

69  {
70  CHECK(cb);
71  VLOG(1) << "Executor " << executor_id_
72  << " is executing update/delete work unit:" << ra_exe_unit_in;
73 
74  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
75  ColumnCacheMap column_cache;
76 
77  ColumnFetcher column_fetcher(this, column_cache);
78  CHECK_GT(ra_exe_unit.input_descs.size(), size_t(0));
79  const auto table_id = ra_exe_unit.input_descs[0].getTableId();
80  const auto& outer_fragments = table_infos.front().info.fragments;
81 
82  std::vector<FragmentsPerTable> fragments = {{0, {0}}};
83  for (size_t tab_idx = 1; tab_idx < ra_exe_unit.input_descs.size(); tab_idx++) {
84  int table_id = ra_exe_unit.input_descs[tab_idx].getTableId();
85  CHECK_EQ(table_infos[tab_idx].table_id, table_id);
86  const auto& fragmentsPerTable = table_infos[tab_idx].info.fragments;
87  FragmentsPerTable entry = {table_id, {}};
88  for (size_t innerFragId = 0; innerFragId < fragmentsPerTable.size(); innerFragId++) {
89  entry.fragment_ids.push_back(innerFragId);
90  }
91  fragments.push_back(entry);
92  }
93 
94  if (outer_fragments.empty()) {
95  return;
96  }
97 
98  const auto max_tuple_count_fragment_it = std::max_element(
99  outer_fragments.begin(), outer_fragments.end(), [](const auto& a, const auto& b) {
100  return a.getNumTuples() < b.getNumTuples();
101  });
102  CHECK(max_tuple_count_fragment_it != outer_fragments.end());
103  int64_t global_max_groups_buffer_entry_guess =
104  max_tuple_count_fragment_it->getNumTuples();
105  if (is_agg) {
106  global_max_groups_buffer_entry_guess = std::min(
107  2 * global_max_groups_buffer_entry_guess, static_cast<int64_t>(100'000'000));
108  }
109 
110  auto query_comp_desc = std::make_unique<QueryCompilationDescriptor>();
111  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
112  {
113  auto clock_begin = timer_start();
114  std::lock_guard<std::mutex> compilation_lock(compilation_mutex_);
115  compilation_queue_time_ms_ += timer_stop(clock_begin);
116 
117  query_mem_desc = query_comp_desc->compile(global_max_groups_buffer_entry_guess,
118  8,
119  /*has_cardinality_estimation=*/true,
120  ra_exe_unit,
121  table_infos,
122  deleted_cols_map,
123  column_fetcher,
124  co,
125  eo,
126  nullptr,
127  this);
128  }
129  CHECK(query_mem_desc);
130 
131  for (size_t fragment_index = 0; fragment_index < outer_fragments.size();
132  ++fragment_index) {
133  const int64_t crt_fragment_tuple_count =
134  outer_fragments[fragment_index].getNumTuples();
135  if (crt_fragment_tuple_count == 0) {
136  // nothing to update
137  continue;
138  }
139 
140  SharedKernelContext shared_context(table_infos);
141  fragments[0] = {table_id, {fragment_index}};
142  {
143  ExecutionKernel current_fragment_kernel(ra_exe_unit,
144  ExecutorDeviceType::CPU,
145  0,
146  eo,
147  column_fetcher,
148  *query_comp_desc,
149  *query_mem_desc,
150  fragments,
151  ExecutorDispatchMode::KernelPerFragment,
152  /*render_info=*/nullptr,
153  /*rowid_lookup_key=*/-1);
154 
155  auto clock_begin = timer_start();
156  std::lock_guard<std::mutex> kernel_lock(kernel_mutex_);
157  kernel_queue_time_ms_ += timer_stop(clock_begin);
158 
159  current_fragment_kernel.run(this, shared_context);
160  }
161  const auto& proj_fragment_results = shared_context.getFragmentResults();
162  if (proj_fragment_results.empty()) {
163  continue;
164  }
165  const auto& proj_fragment_result = proj_fragment_results[0];
166  const auto proj_result_set = proj_fragment_result.first;
167  CHECK(proj_result_set);
168  cb({outer_fragments[fragment_index], fragment_index, proj_result_set});
169  }
170 }
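As the listing shows, executeUpdate compiles the work unit once, then runs one ExecutionKernel per non-empty outer fragment under the kernel lock and hands each fragment's projected ResultSet to the caller-supplied callback. Below is a minimal sketch of that compile-once / kernel-per-fragment / callback shape; Fragment, ResultHandle, and run_kernel are hypothetical stand-ins, and nothing is assumed about the real UpdateLogForFragment::Callback beyond the call site above.

// Sketch of the kernel-per-fragment, callback-per-result pattern.
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Fragment {
  std::size_t num_tuples;
};
using ResultHandle = std::string;  // stand-in for a per-fragment ResultSet

using FragmentCallback =
    std::function<void(const Fragment&, std::size_t, const ResultHandle&)>;

// Stand-in for running the compiled kernel over one fragment.
ResultHandle run_kernel(const Fragment& frag) {
  return "projected " + std::to_string(frag.num_tuples) + " rows";
}

void execute_update(const std::vector<Fragment>& outer_fragments,
                    const FragmentCallback& cb) {
  // (the shared plan would be compiled once, here)
  for (std::size_t i = 0; i < outer_fragments.size(); ++i) {
    if (outer_fragments[i].num_tuples == 0) {
      continue;  // nothing to update in this fragment
    }
    cb(outer_fragments[i], i, run_kernel(outer_fragments[i]));
  }
}

int main() {
  execute_update({{3}, {0}, {5}},
                 [](const Fragment&, std::size_t idx, const ResultHandle& r) {
                   std::cout << "fragment " << idx << ": " << r << "\n";
                 });
  return 0;
}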

+ Here is the call graph for this function:

ResultSetPtr Executor::executeWorkUnit ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit ra_exe_unit_in,
const CompilationOptions co,
const ExecutionOptions options,
const Catalog_Namespace::Catalog cat,
RenderInfo render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap column_cache 
)

Definition at line 1345 of file Execute.cpp.

References executor_id_(), ExecutionOptions::just_validate, CompilationRetryNewScanLimit::new_scan_limit_, anonymous_namespace{Execute.cpp}::replace_scan_limit(), run_benchmark_import::result, and VLOG.

1354  {
1355  VLOG(1) << "Executor " << executor_id_ << " is executing work unit:" << ra_exe_unit_in;
1356 
1357  ScopeGuard cleanup_post_execution = [this] {
1358  // cleanup/unpin GPU buffer allocations
1359  // TODO: separate out this state into a single object
1360  plan_state_.reset(nullptr);
1361  if (cgen_state_) {
1362  cgen_state_->in_values_bitmaps_.clear();
1363  }
1364  };
1365 
1366  try {
1367  auto result = executeWorkUnitImpl(max_groups_buffer_entry_guess,
1368  is_agg,
1369  true,
1370  query_infos,
1371  ra_exe_unit_in,
1372  co,
1373  eo,
1374  cat,
1375  row_set_mem_owner_,
1376  render_info,
1377  has_cardinality_estimation,
1378  column_cache);
1379  if (result) {
1380  result->setKernelQueueTime(kernel_queue_time_ms_);
1381  result->addCompilationQueueTime(compilation_queue_time_ms_);
1382  if (eo.just_validate) {
1383  result->setValidationOnlyRes();
1384  }
1385  }
1386  return result;
1387  } catch (const CompilationRetryNewScanLimit& e) {
1388  auto result =
1389  executeWorkUnitImpl(max_groups_buffer_entry_guess,
1390  is_agg,
1391  false,
1392  query_infos,
1393  replace_scan_limit(ra_exe_unit_in, e.new_scan_limit_),
1394  co,
1395  eo,
1396  cat,
1397  row_set_mem_owner_,
1398  render_info,
1399  has_cardinality_estimation,
1400  column_cache);
1401  if (result) {
1402  result->setKernelQueueTime(kernel_queue_time_ms_);
1403  result->addCompilationQueueTime(compilation_queue_time_ms_);
1404  if (eo.just_validate) {
1405  result->setValidationOnlyRes();
1406  }
1407  }
1408  return result;
1409  }
1410 }
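The listing wraps executeWorkUnitImpl with a ScopeGuard that resets per-query state no matter how the call exits, and retries exactly once with a replaced scan limit when compilation throws CompilationRetryNewScanLimit. The sketch below shows that guard-plus-single-retry shape in isolation; ScopeGuard, RetryWithNewLimit, and run_impl are generic stand-ins rather than the actual Executor members.

// Generic sketch of the scope-guard + retry-once pattern.
#include <cstddef>
#include <exception>
#include <functional>
#include <iostream>

struct RetryWithNewLimit : public std::exception {
  explicit RetryWithNewLimit(const std::size_t limit) : new_limit(limit) {}
  std::size_t new_limit;
};

struct ScopeGuard {
  std::function<void()> on_exit;
  ~ScopeGuard() { on_exit(); }  // runs on normal return and on exceptions
};

int run_impl(const std::size_t scan_limit) {
  if (scan_limit == 0) {
    throw RetryWithNewLimit(1000);  // ask the caller to retry with a real limit
  }
  return 42;
}

int run(const std::size_t scan_limit) {
  ScopeGuard cleanup{[] { std::cout << "cleanup per-query state\n"; }};
  try {
    return run_impl(scan_limit);
  } catch (const RetryWithNewLimit& e) {
    return run_impl(e.new_limit);  // retry exactly once with the new limit
  }
}

int main() {
  std::cout << run(0) << "\n";  // prints the cleanup message, then 42
  return 0;
}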

+ Here is the call graph for this function:

ResultSetPtr Executor::executeWorkUnitImpl ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit ra_exe_unit_in,
const CompilationOptions co,
const ExecutionOptions options,
const Catalog_Namespace::Catalog cat,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
RenderInfo render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap column_cache 
)
private

Definition at line 1412 of file Execute.cpp.

References CompilationOptions::allow_lazy_fetch, ExecutionOptions::allow_runtime_query_interrupt, catalog_(), CHECK, anonymous_namespace{Execute.cpp}::compute_buffer_entry_guess(), CPU, cpu_threads(), CompilationOptions::device_type, ExecutionOptions::executor_type, CompilationOptions::explain_type, CompilationOptions::filter_on_deleted_column, g_use_tbb_pool, get_available_gpus(), get_context_count(), get_min_byte_width(), QueryExecutionError::getErrorCode(), CompilationOptions::hoist_literals, INJECT_TIMER, ExecutionOptions::just_explain, ExecutionOptions::just_validate, MAX_BYTE_WIDTH_SUPPORTED, Native, CompilationOptions::opt_level, Projection, CompilationOptions::register_intel_jit_listener, timer_start(), timer_stop(), VLOG, CompilationOptions::with_dynamic_watchdog, and ExecutionOptions::with_dynamic_watchdog.

1424  {
1425  INJECT_TIMER(Exec_executeWorkUnit);
1426  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
1427  const auto device_type = getDeviceTypeForTargets(ra_exe_unit, co.device_type);
1428  CHECK(!query_infos.empty());
1429  if (!max_groups_buffer_entry_guess) {
1430  // The query has failed the first execution attempt because of running out
1431  // of group by slots. Make the conservative choice: allocate fragment size
1432  // slots and run on the CPU.
1433  CHECK(device_type == ExecutorDeviceType::CPU);
1434  max_groups_buffer_entry_guess = compute_buffer_entry_guess(query_infos);
1435  }
1436 
1437  int8_t crt_min_byte_width{get_min_byte_width()};
1438  do {
1439  SharedKernelContext shared_context(query_infos);
1440  ColumnFetcher column_fetcher(this, column_cache);
1441  auto query_comp_desc_owned = std::make_unique<QueryCompilationDescriptor>();
1442  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc_owned;
1443  if (eo.executor_type == ExecutorType::Native) {
1444  try {
1445  INJECT_TIMER(query_step_compilation);
1446  auto clock_begin = timer_start();
1447  std::lock_guard<std::mutex> compilation_lock(compilation_mutex_);
1448  compilation_queue_time_ms_ += timer_stop(clock_begin);
1449 
1450  query_mem_desc_owned =
1451  query_comp_desc_owned->compile(max_groups_buffer_entry_guess,
1452  crt_min_byte_width,
1453  has_cardinality_estimation,
1454  ra_exe_unit,
1455  query_infos,
1456  deleted_cols_map,
1457  column_fetcher,
1458  {device_type,
1459  co.hoist_literals,
1460  co.opt_level,
1461  co.with_dynamic_watchdog,
1462  co.allow_lazy_fetch,
1463  co.filter_on_deleted_column,
1464  co.explain_type,
1465  co.register_intel_jit_listener},
1466  eo,
1467  render_info,
1468  this);
1469  CHECK(query_mem_desc_owned);
1470  crt_min_byte_width = query_comp_desc_owned->getMinByteWidth();
1471  } catch (CompilationRetryNoCompaction&) {
1472  crt_min_byte_width = MAX_BYTE_WIDTH_SUPPORTED;
1473  continue;
1474  }
1475  } else {
1476  plan_state_.reset(new PlanState(false, query_infos, deleted_cols_map, this));
1477  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
1478  CHECK(!query_mem_desc_owned);
1479  query_mem_desc_owned.reset(
1480  new QueryMemoryDescriptor(this, 0, QueryDescriptionType::Projection, false));
1481  }
1482  if (eo.just_explain) {
1483  return executeExplain(*query_comp_desc_owned);
1484  }
1485 
1486  for (const auto target_expr : ra_exe_unit.target_exprs) {
1487  plan_state_->target_exprs_.push_back(target_expr);
1488  }
1489 
1490  if (!eo.just_validate) {
1491  int available_cpus = cpu_threads();
1492  auto available_gpus = get_available_gpus(cat);
1493 
1494  const auto context_count =
1495  get_context_count(device_type, available_cpus, available_gpus.size());
1496  try {
1497  auto kernels = createKernels(shared_context,
1498  ra_exe_unit,
1499  column_fetcher,
1500  query_infos,
1501  eo,
1502  is_agg,
1503  allow_single_frag_table_opt,
1504  context_count,
1505  *query_comp_desc_owned,
1506  *query_mem_desc_owned,
1507  render_info,
1508  available_gpus,
1509  available_cpus);
1510  if (g_use_tbb_pool) {
1511 #ifdef HAVE_TBB
1512  VLOG(1) << "Using TBB thread pool for kernel dispatch.";
1513  launchKernels<threadpool::TbbThreadPool<void>>(shared_context,
1514  std::move(kernels));
1515 #else
1516  throw std::runtime_error(
1517  "This build is not TBB enabled. Restart the server with "
1518  "\"enable-modern-thread-pool\" disabled.");
1519 #endif
1520  } else {
1521  launchKernels<threadpool::FuturesThreadPool<void>>(shared_context,
1522  std::move(kernels));
1523  }
1524  } catch (QueryExecutionError& e) {
1525  if (eo.with_dynamic_watchdog && interrupted_.load() &&
1526  e.getErrorCode() == ERR_OUT_OF_TIME) {
1527  resetInterrupt();
1528  throw QueryExecutionError(ERR_INTERRUPTED);
1529  }
1530  if (eo.allow_runtime_query_interrupt && interrupted_.load()) {
1531  resetInterrupt();
1532  throw QueryExecutionError(ERR_INTERRUPTED);
1533  }
1534  if (e.getErrorCode() == ERR_OVERFLOW_OR_UNDERFLOW &&
1535  static_cast<size_t>(crt_min_byte_width << 1) <= sizeof(int64_t)) {
1536  crt_min_byte_width <<= 1;