OmniSciDB  72c90bc290
Executor Class Reference

#include <Execute.h>


Classes

class  CgenStateManager
 
struct  ExecutorMutexHolder
 
class  FetchCacheAnchor
 
struct  GroupColLLVMValue
 
struct  JoinHashTableOrError
 

Public Types

enum  ExtModuleKinds {
  ExtModuleKinds::template_module, ExtModuleKinds::udf_cpu_module, ExtModuleKinds::udf_gpu_module, ExtModuleKinds::rt_udf_cpu_module,
  ExtModuleKinds::rt_udf_gpu_module, ExtModuleKinds::rt_geos_module, ExtModuleKinds::rt_libdevice_module
}
 
using ExecutorId = size_t
 
using CachedCardinality = std::pair< bool, size_t >
 

Public Member Functions

 Executor (const ExecutorId id, Data_Namespace::DataMgr *data_mgr, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
 
void clearCaches (bool runtime_only=false)
 
std::string dumpCache () const
 
void reset (bool discard_runtime_modules_only=false)
 
const std::unique_ptr< llvm::Module > & get_rt_module () const

const std::unique_ptr< llvm::Module > & get_udf_module (bool is_gpu=false) const

const std::unique_ptr< llvm::Module > & get_rt_udf_module (bool is_gpu=false) const

const std::unique_ptr< llvm::Module > & get_geos_module () const

const std::unique_ptr< llvm::Module > & get_libdevice_module () const
 
bool has_rt_module () const
 
bool has_udf_module (bool is_gpu=false) const
 
bool has_rt_udf_module (bool is_gpu=false) const
 
bool has_geos_module () const
 
bool has_libdevice_module () const
 
const TemporaryTables * getTemporaryTables ()

StringDictionaryProxy * getStringDictionaryProxy (const shared::StringDictKey &dict_key, const bool with_generation) const

StringDictionaryProxy * getStringDictionaryProxy (const shared::StringDictKey &dict_key, const std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const

const StringDictionaryProxy::IdMap * getStringProxyTranslationMap (const shared::StringDictKey &source_dict_key, const shared::StringDictKey &dest_dict_key, const RowSetMemoryOwner::StringTranslationType translation_type, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const

const StringDictionaryProxy::IdMap * getJoinIntersectionStringProxyTranslationMap (const StringDictionaryProxy *source_proxy, StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &source_string_op_infos, const std::vector< StringOps_Namespace::StringOpInfo > &dest_source_string_op_infos, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner) const

const StringDictionaryProxy::TranslationMap< Datum > * getStringProxyNumericTranslationMap (const shared::StringDictKey &source_dict_key, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool with_generation) const
 
bool isCPUOnly () const
 
bool isArchMaxwell (const ExecutorDeviceType dt) const
 
bool containsLeftDeepOuterJoin () const
 
const ColumnDescriptor * getColumnDescriptor (const Analyzer::ColumnVar *) const

const ColumnDescriptor * getPhysicalColumnDescriptor (const Analyzer::ColumnVar *, int) const

Data_Namespace::DataMgr * getDataMgr () const

const std::shared_ptr< RowSetMemoryOwner > getRowSetMemoryOwner () const

const TemporaryTables * getTemporaryTables () const

Fragmenter_Namespace::TableInfo getTableInfo (const shared::TableKey &table_key) const

const TableGeneration & getTableGeneration (const shared::TableKey &table_key) const
 
ExpressionRange getColRange (const PhysicalInput &) const
 
size_t getNumBytesForFetchedRow (const std::set< shared::TableKey > &table_keys_to_fetch) const
 
std::map< shared::ColumnKey, size_t > getColumnByteWidthMap (const std::set< shared::TableKey > &table_ids_to_fetch, const bool include_lazy_fetched_cols) const
 
size_t getNumBytesForFetchedRow (const std::set< int > &table_ids_to_fetch) const
 
ExecutorResourceMgr_Namespace::ChunkRequestInfo getChunkRequestInfo (const ExecutorDeviceType device_type, const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos, const std::vector< std::pair< int32_t, FragmentsList >> &device_fragment_lists) const
 Determines a unique list of chunks and their associated byte sizes for a given query plan. More...
 
bool hasLazyFetchColumns (const std::vector< Analyzer::Expr * > &target_exprs) const
 
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo (const std::vector< Analyzer::Expr * > &target_exprs) const
 
void interrupt (const QuerySessionId &query_session="", const QuerySessionId &interrupt_session="")
 
void resetInterrupt ()
 
void enableRuntimeQueryInterrupt (const double runtime_query_check_freq, const unsigned pending_query_check_freq) const
 
int8_t warpSize () const
 
unsigned gridSize () const
 
void setGridSize (unsigned grid_size)
 
void resetGridSize ()
 
unsigned numBlocksPerMP () const
 
unsigned blockSize () const
 
void setBlockSize (unsigned block_size)
 
void resetBlockSize ()
 
size_t maxGpuSlabSize () const
 
ResultSetPtr executeWorkUnit (size_t &max_groups_buffer_entry_guess, const bool is_agg, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
TableUpdateMetadata executeUpdate (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const TableDescriptor *updated_table_desc, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const UpdateLogForFragment::Callback &cb, const bool is_agg)
 
void addTransientStringLiterals (const RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< RowSetMemoryOwner > &row_set_mem_owner)
 
int deviceCount (const ExecutorDeviceType) const
 
void logSystemCPUMemoryStatus (std::string const &tag, size_t const thread_idx) const
 
void logSystemGPUMemoryStatus (std::string const &tag, size_t const thread_idx) const
 
void setupCaching (const std::unordered_set< PhysicalInput > &phys_inputs, const std::unordered_set< shared::TableKey > &phys_table_keys)
 
void setColRangeCache (const AggregatedColRange &aggregated_col_range)
 
ExecutorId getExecutorId () const
 
QuerySessionId & getCurrentQuerySession (heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
 
QuerySessionStatus::QueryStatus getQuerySessionStatus (const QuerySessionId &candidate_query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
 
bool checkCurrentQuerySession (const std::string &candidate_query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
 
void invalidateRunningQuerySession (heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
 
bool addToQuerySessionList (const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
 
bool removeFromQuerySessionList (const QuerySessionId &query_session, const std::string &submitted_time_str, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
 
void setQuerySessionAsInterrupted (const QuerySessionId &query_session, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
 
bool checkIsQuerySessionInterrupted (const std::string &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
 
bool checkIsQuerySessionEnrolled (const QuerySessionId &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
 
bool updateQuerySessionStatusWithLock (const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus updated_query_status, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
 
bool updateQuerySessionExecutorAssignment (const QuerySessionId &query_session, const std::string &submitted_time_str, const size_t executor_id, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
 
std::vector< QuerySessionStatus > getQuerySessionInfo (const QuerySessionId &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)

heavyai::shared_mutex & getSessionLock ()
 
CurrentQueryStatus attachExecutorToQuerySession (const QuerySessionId &query_session_id, const std::string &query_str, const std::string &query_submitted_time)
 
void checkPendingQueryStatus (const QuerySessionId &query_session)
 
void clearQuerySessionStatus (const QuerySessionId &query_session, const std::string &submitted_time_str)
 
void updateQuerySessionStatus (const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus new_query_status)
 
void enrollQuerySession (const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted_time_str, const size_t executor_id, const QuerySessionStatus::QueryStatus query_session_status)
 
size_t getNumCurentSessionsEnrolled () const
 
const std::vector< size_t > getExecutorIdsRunningQuery (const QuerySessionId &interrupt_session) const
 
bool checkNonKernelTimeInterrupted () const
 
void registerExtractedQueryPlanDag (const QueryPlanDAG &query_plan_dag)
 
const QueryPlanDAG getLatestQueryPlanDagExtracted () const
 
void addToCardinalityCache (const CardinalityCacheKey &cache_key, const size_t cache_value)
 
CachedCardinality getCachedCardinality (const CardinalityCacheKey &cache_key)
 
heavyai::shared_mutex & getDataRecyclerLock ()

QueryPlanDagCache & getQueryPlanDagCache ()

ResultSetRecyclerHolder & getResultSetRecyclerHolder ()

CgenState * getCgenStatePtr () const

PlanState * getPlanStatePtr () const
 
llvm::LLVMContext & getContext ()
 
void update_extension_modules (bool update_runtime_modules_only=false)
 

Static Public Member Functions

static void clearExternalCaches (bool for_update, const TableDescriptor *td, const int current_db_id)
 
template<typename F >
static void registerExtensionFunctions (F register_extension_functions)
 
static std::shared_ptr< Executor > getExecutor (const ExecutorId id, const std::string &debug_dir="", const std::string &debug_file="", const SystemParameters &system_parameters=SystemParameters())
 
static void nukeCacheOfExecutors ()
 
static void clearMemory (const Data_Namespace::MemoryLevel memory_level)
 
static size_t getArenaBlockSize ()
 
static void addUdfIrToModule (const std::string &udf_ir_filename, const bool is_cuda_ir)
 
static void initialize_extension_module_sources ()
 
static void registerActiveModule (void *module, const int device_id)
 
static void unregisterActiveModule (const int device_id)
 
static std::pair< int64_t, int32_t > reduceResults (const SQLAgg agg, const SQLTypeInfo &ti, const int64_t agg_init_val, const int8_t out_byte_width, const int64_t *out_vec, const size_t out_vec_sz, const bool is_group_by, const bool float_argument_input)
 
static void clearCardinalityCache ()
 
static void invalidateCardinalityCacheForTable (const shared::TableKey &table_key)
 
static void update_after_registration (bool update_runtime_modules_only=false)
 
static void init_resource_mgr (const size_t num_cpu_slots, const size_t num_gpu_slots, const size_t cpu_result_mem, const size_t cpu_buffer_pool_mem, const size_t gpu_buffer_pool_mem, const double per_query_max_cpu_slots_ratio, const double per_query_max_cpu_result_mem_ratio, const bool allow_cpu_kernel_concurrency, const bool allow_cpu_gpu_kernel_concurrency, const bool allow_cpu_slot_oversubscription_concurrency, const bool allow_cpu_result_mem_oversubscription, const double max_available_resource_use_ratio)
 
static void pause_executor_queue ()
 
static void resume_executor_queue ()
 
static size_t get_executor_resource_pool_total_resource_quantity (const ExecutorResourceMgr_Namespace::ResourceType resource_type)
 
static ExecutorResourceMgr_Namespace::ResourcePoolInfo get_executor_resource_pool_info ()
 
static void set_executor_resource_pool_resource (const ExecutorResourceMgr_Namespace::ResourceType resource_type, const size_t resource_quantity)
 
static size_t getBaselineThreshold (bool for_count_distinct, ExecutorDeviceType device_type)
 
static const ExecutorResourceMgr_Namespace::ConcurrentResourceGrantPolicy get_concurrent_resource_grant_policy (const ExecutorResourceMgr_Namespace::ResourceType resource_type)
 
static void set_concurrent_resource_grant_policy (const ExecutorResourceMgr_Namespace::ConcurrentResourceGrantPolicy &concurrent_resource_grant_policy)
 

Public Attributes

std::mutex compilation_mutex_
 

Static Public Attributes

static constexpr ExecutorId UNITARY_EXECUTOR_ID = 0
 
static constexpr ExecutorId INVALID_EXECUTOR_ID = SIZE_MAX
 
static std::map< ExtModuleKinds, std::string > extension_module_sources
 
static const int32_t ERR_DIV_BY_ZERO {1}
 
static const int32_t ERR_OUT_OF_GPU_MEM {2}
 
static const int32_t ERR_OUT_OF_SLOTS {3}
 
static const int32_t ERR_UNSUPPORTED_SELF_JOIN {4}
 
static const int32_t ERR_OUT_OF_RENDER_MEM {5}
 
static const int32_t ERR_OUT_OF_CPU_MEM {6}
 
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW {7}
 
static const int32_t ERR_OUT_OF_TIME {9}
 
static const int32_t ERR_INTERRUPTED {10}
 
static const int32_t ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED {11}
 
static const int32_t ERR_TOO_MANY_LITERALS {12}
 
static const int32_t ERR_STRING_CONST_IN_RESULTSET {13}
 
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY {14}
 
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES {15}
 
static const int32_t ERR_GEOS {16}
 
static const int32_t ERR_WIDTH_BUCKET_INVALID_ARGUMENT {17}
 
static std::mutex register_runtime_extension_functions_mutex_
 
static std::mutex kernel_mutex_
 
static const size_t auto_cpu_mem_bytes {size_t(0)}
 
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_ = nullptr
 

Private Types

using PerFragmentCallBack = std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>
 

Private Member Functions

void clearMetaInfoCache ()
 
int deviceCountForMemoryLevel (const Data_Namespace::MemoryLevel memory_level) const
 
llvm::Value * codegenWindowFunction (const size_t target_index, const CompilationOptions &co)
 
llvm::Value * codegenConditionalAggregateCondValSelector (llvm::Value *cond_lv, SQLAgg const aggKind, CompilationOptions const &co) const
 
llvm::Value * codegenWindowFunctionAggregate (CodeGenerator *code_generator, const CompilationOptions &co)
 
std::pair< llvm::BasicBlock *, llvm::Value * > codegenWindowResetStateControlFlow (CodeGenerator *code_generator, const CompilationOptions &co)
 
void codegenWindowFunctionStateInit (CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *aggregate_state)
 
llvm::Value * codegenWindowFunctionAggregateCalls (llvm::Value *aggregate_state, const CompilationOptions &co)
 
llvm::Value * codegenWindowNavigationFunctionOnFrame (const CompilationOptions &co)
 
llvm::Value * codegenCurrentPartitionIndex (const WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *current_row_pos_lv)
 
llvm::Value * codegenFrameBoundExpr (const Analyzer::WindowFunction *window_func, const Analyzer::WindowFrame *frame_bound, CodeGenerator &code_generator, const CompilationOptions &co)
 
llvm::Value * codegenFrameBound (bool for_start_bound, bool for_range_mode, bool for_window_frame_naviation, const Analyzer::WindowFrame *frame_bound, bool is_timestamp_type_frame, llvm::Value *order_key_null_val, const WindowFrameBoundFuncArgs &args)
 
std::pair< std::string, llvm::Value * > codegenLoadOrderKeyBufPtr (WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co) const

std::pair< llvm::Value *, llvm::Value * > codegenFrameNullRange (WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *partition_index_lv) const
 
WindowPartitionBufferPtrs codegenLoadPartitionBuffers (WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *partition_index_lv) const
 
std::pair< llvm::Value *, llvm::Value * > codegenWindowFrameBounds (WindowFunctionContext *window_func_context, const Analyzer::WindowFrame *frame_start_bound, const Analyzer::WindowFrame *frame_end_bound, llvm::Value *order_key_col_null_val_lv, WindowFrameBoundFuncArgs &args, CodeGenerator &code_generator)

std::pair< llvm::Value *, llvm::Value * > codegenFrameBoundRange (const Analyzer::WindowFunction *window_func, CodeGenerator &code_generator, const CompilationOptions &co)
 
std::vector< llvm::Value * > prepareRowModeFuncArgs (bool for_start_bound, SqlWindowFrameBoundType bound_type, const WindowFrameBoundFuncArgs &args) const
 
std::vector< llvm::Value * > prepareRangeModeFuncArgs (bool for_start_bound, const Analyzer::WindowFrame *frame_bound, bool is_timestamp_type_frame, llvm::Value *order_key_null_val, const WindowFrameBoundFuncArgs &frame_args) const
 
const std::string getOrderKeyTypeName (WindowFunctionContext *window_func_context) const
 
llvm::Value * codegenLoadCurrentValueFromColBuf (WindowFunctionContext *window_func_context, CodeGenerator &code_generator, WindowFrameBoundFuncArgs &args) const
 
size_t getOrderKeySize (WindowFunctionContext *window_func_context) const
 
const SQLTypeInfo getFirstOrderColTypeInfo (WindowFunctionContext *window_func_context) const
 
std::string getFramingFuncName (const std::string &bound_type, const std::string &order_col_type, const std::string &op_type, bool for_timestamp_type) const
 
void codegenWindowAvgEpilogue (CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *crt_val, llvm::Value *window_func_null_val)
 
llvm::Value * codegenAggregateWindowState (CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *aggregate_state)
 
llvm::Value * aggregateWindowStatePtr (CodeGenerator *code_generator, const CompilationOptions &co)
 
CudaMgr_Namespace::CudaMgr * cudaMgr () const
 
bool isArchPascalOrLater (const ExecutorDeviceType dt) const
 
bool needFetchAllFragments (const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
 
bool needLinearizeAllFragments (const ColumnDescriptor *cd, const InputColDescriptor &inner_col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments, const Data_Namespace::MemoryLevel memory_level) const
 
void executeWorkUnitPerFragment (const RelAlgExecutionUnit &ra_exe_unit, const InputTableInfo &table_info, const CompilationOptions &co, const ExecutionOptions &eo, const Catalog_Namespace::Catalog &cat, PerFragmentCallBack &cb, const std::set< size_t > &fragment_indexes_param)
 Compiles and dispatches a work unit per fragment processing results with the per fragment callback. Currently used for computing metrics over fragments (metadata). More...
 
ResultSetPtr executeExplain (const QueryCompilationDescriptor &)
 
ResultSetPtr executeTableFunction (const TableFunctionExecutionUnit exe_unit, const std::vector< InputTableInfo > &table_infos, const CompilationOptions &co, const ExecutionOptions &eo)
 Compiles and dispatches a table function; that is, a function that takes as input one or more columns and returns a ResultSet, which can be parsed by subsequent execution steps. More...
 
ExecutorDeviceType getDeviceTypeForTargets (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType requested_device_type)
 
ResultSetPtr collectAllDeviceResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
 
ResultSetPtr collectAllDeviceShardedTopResults (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type) const
 
std::unordered_map< shared::TableKey, const Analyzer::BinOper * > getInnerTabIdToJoinCond () const

std::vector< std::unique_ptr< ExecutionKernel > > createKernels (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, ColumnFetcher &column_fetcher, const std::vector< InputTableInfo > &table_infos, const ExecutionOptions &eo, const bool is_agg, const bool allow_single_frag_table_opt, const size_t context_count, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, RenderInfo *render_info, std::unordered_set< int > &available_gpus, int &available_cpus)
 
void launchKernelsImpl (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type, const size_t requested_num_threads)
 
void launchKernelsLocked (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type)
 
void launchKernelsViaResourceMgr (SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type, const std::vector< InputDescriptor > &input_descs, const QueryMemoryDescriptor &query_mem_desc)
 Launches a vector of kernels for a given query step, gated/scheduled by ExecutorResourceMgr. More...
 
std::vector< size_t > getTableFragmentIndices (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type, const size_t table_idx, const size_t outer_frag_idx, std::map< shared::TableKey, const TableFragments * > &selected_tables_fragments, const std::unordered_map< shared::TableKey, const Analyzer::BinOper * > &inner_table_id_to_join_condition)
 
bool skipFragmentPair (const Fragmenter_Namespace::FragmentInfo &outer_fragment_info, const Fragmenter_Namespace::FragmentInfo &inner_fragment_info, const int inner_table_id, const std::unordered_map< shared::TableKey, const Analyzer::BinOper * > &inner_table_id_to_join_condition, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
 
FetchResult fetchChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< shared::TableKey, const TableFragments * > &, const FragmentsList &selected_fragments, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
 
FetchResult fetchUnionChunks (const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< shared::TableKey, const TableFragments * > &, const FragmentsList &selected_fragments, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
 
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags (const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments)
 
void buildSelectedFragsMapping (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
void buildSelectedFragsMappingForUnion (std::vector< std::vector< size_t >> &selected_fragments_crossjoin, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< size_t > getFragmentCount (const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
 
int32_t executePlanWithGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr *results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const shared::TableKey &outer_table_key, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info, const bool optimize_cuda_block_and_grid_sizes, const int64_t rows_to_process=-1)
 
int32_t executePlanWithoutGroupBy (const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr *results, const std::vector< Analyzer::Expr * > &target_exprs, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, QueryExecutionContext *query_exe_context, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *data_mgr, const int device_id, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info, const bool optimize_cuda_block_and_grid_sizes, const int64_t rows_to_process=-1)
 
ResultSetPtr resultsUnion (SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< int8_t * > getJoinHashTablePtrs (const ExecutorDeviceType device_type, const int device_id)
 
ResultSetPtr reduceMultiDeviceResults (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > getUniqueThreadSharedResultSets (const std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &results_per_device) const
 
ResultSetPtr reduceMultiDeviceResultSets (std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr reduceSpeculativeTopN (const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
 
ResultSetPtr executeWorkUnitImpl (size_t &max_groups_buffer_entry_guess, const bool is_agg, const bool allow_single_frag_table_opt, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, std::shared_ptr< RowSetMemoryOwner >, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
 
std::vector< llvm::Value * > inlineHoistedLiterals ()
 
void AutoTrackBuffersInRuntimeIR ()
 
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit (const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
 
llvm::BasicBlock * codegenSkipDeletedOuterTableRow (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
std::vector< JoinLoop > buildJoinLoops (RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
 
JoinLoop::HoistedFiltersCallback buildHoistLeftHandSideFiltersCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const shared::TableKey &inner_table_key, const CompilationOptions &co)
 
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb (const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)

std::shared_ptr< HashJoin > buildCurrentLevelHashTable (const JoinCondition &current_level_join_conditions, size_t level_idx, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
 
void redeclareFilterFunction ()
 
llvm::Value * addJoinLoopIterator (const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
 
void codegenJoinLoops (const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
 
bool compileBody (const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
 
void createErrorCheckControlFlow (llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, const std::vector< JoinLoop > &join_loops, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
 
void insertErrorCodeChecker (llvm::Function *query_func, unsigned const error_code_idx, bool hoist_literals, bool allow_runtime_query_interrupt)
 
void preloadFragOffsets (const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
 
JoinHashTableOrError buildHashTableForQualifier (const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, ColumnCacheMap &column_cache, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
 
void nukeOldState (const bool allow_lazy_fetch, const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit *ra_exe_unit)
 
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU (llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)

std::shared_ptr< CompilationContext > optimizeAndCodegenGPU (llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool is_gpu_smem_used, const CompilationOptions &)
 
std::string generatePTX (const std::string &) const
 
void initializeNVPTXBackend () const
 
int64_t deviceCycles (int milliseconds) const
 
GroupColLLVMValue groupByColumnCodegen (Analyzer::Expr *group_by_col, const size_t col_width, const CompilationOptions &, const bool translate_null_val, const int64_t translated_null_val, DiamondCodegen &, std::stack< llvm::BasicBlock * > &, const bool thread_mem_shared)
 
llvm::Value * castToFP (llvm::Value *, SQLTypeInfo const &from_ti, SQLTypeInfo const &to_ti)
 
llvm::Value * castToIntPtrTyIn (llvm::Value *val, const size_t bit_width)
 
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn (const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
 
bool isFragmentFullyDeleted (const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &fragment)
 
FragmentSkipStatus canSkipFragmentForFpQual (const Analyzer::BinOper *comp_expr, const Analyzer::ColumnVar *lhs_col, const Fragmenter_Namespace::FragmentInfo &fragment, const Analyzer::Constant *rhs_const) const
 
std::pair< bool, int64_t > skipFragment (const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
std::pair< bool, int64_t > skipFragmentInnerJoins (const InputDescriptor &table_desc, const RelAlgExecutionUnit &ra_exe_unit, const Fragmenter_Namespace::FragmentInfo &fragment, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
 
AggregatedColRange computeColRangesCache (const std::unordered_set< PhysicalInput > &phys_inputs)
 
StringDictionaryGenerations computeStringDictionaryGenerations (const std::unordered_set< PhysicalInput > &phys_inputs)
 
TableGenerations computeTableGenerations (const std::unordered_set< shared::TableKey > &phys_table_keys)
 
std::vector< int8_t > serializeLiterals (const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
 
const std::unique_ptr< llvm::Module > & get_extension_module (ExtModuleKinds kind) const
 
bool has_extension_module (ExtModuleKinds kind) const
 
llvm::Value * spillDoubleElement (llvm::Value *elem_val, llvm::Type *elem_ty)
 
ExecutorMutexHolder acquireExecuteMutex ()
 

Static Private Member Functions

static size_t align (const size_t off_in, const size_t alignment)
 

Private Attributes

const ExecutorId executor_id_
 
std::unique_ptr< llvm::LLVMContext > context_

std::unique_ptr< CgenState > cgen_state_

std::map< ExtModuleKinds, std::unique_ptr< llvm::Module > > extension_modules_

std::unique_ptr< PlanState > plan_state_

std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::mutex gpu_exec_mutex_ [max_gpu_count]
 
std::atomic< bool > interrupted_ {false}
 
std::mutex str_dict_mutex_
 
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
 
unsigned block_size_x_
 
unsigned grid_size_x_
 
const size_t max_gpu_slab_size_
 
const std::string debug_dir_
 
const std::string debug_file_
 
Data_Namespace::DataMgr * data_mgr_

const TemporaryTables * temporary_tables_
 
TableIdToNodeMap table_id_to_node_map_
 
int64_t kernel_queue_time_ms_ = 0
 
int64_t compilation_queue_time_ms_ = 0
 
std::unique_ptr< WindowProjectNodeContext > window_project_node_context_owned_

WindowFunctionContext * active_window_function_ {nullptr}
 
InputTableInfoCache input_table_info_cache_
 
AggregatedColRange agg_col_range_cache_
 
TableGenerations table_generations_
 
QuerySessionId current_query_session_
 

Static Private Attributes

static const int max_gpu_count
 
static const size_t auto_num_threads {size_t(0)}
 
static std::mutex gpu_active_modules_mutex_
 
static uint32_t gpu_active_modules_device_mask_ {0x0}
 
static void * gpu_active_modules_ [max_gpu_count]
 
static const size_t baseline_threshold
 
static heavyai::shared_mutex executor_session_mutex_
 
static InterruptFlagMap queries_interrupt_flag_
 
static QuerySessionMap queries_session_map_
 
static std::map< int, std::shared_ptr< Executor > > executors_
 
static heavyai::shared_mutex execute_mutex_
 
static heavyai::shared_mutex executors_cache_mutex_
 
static QueryPlanDagCache query_plan_dag_cache_
 
static heavyai::shared_mutex recycler_mutex_
 
static std::unordered_map< CardinalityCacheKey, size_t > cardinality_cache_
 
static ResultSetRecyclerHolder resultset_recycler_holder_
 
static QueryPlanDAG latest_query_plan_extracted_ {EMPTY_QUERY_PLAN}
 

Friends

class BaselineJoinHashTable
 
class CodeGenerator
 
class ColumnFetcher
 
struct DiamondCodegen
 
class ExecutionKernel
 
class KernelSubtask
 
class HashJoin
 
class BoundingBoxIntersectJoinHashTable
 
class RangeJoinHashTable
 
class GroupByAndAggregate
 
class QueryCompilationDescriptor
 
class QueryMemoryDescriptor
 
class QueryMemoryInitializer
 
class QueryFragmentDescriptor
 
class QueryExecutionContext
 
class ResultSet
 
class InValuesBitmap
 
class StringDictionaryTranslationMgr
 
class LeafAggregator
 
class PerfectJoinHashTable
 
class QueryRewriter
 
class PendingExecutionClosure
 
class RelAlgExecutor
 
class TableOptimizer
 
class TableFunctionCompilationContext
 
class TableFunctionExecutionContext
 
struct TargetExprCodegenBuilder
 
struct TargetExprCodegen
 
class WindowProjectNodeContext
 

Detailed Description

Definition at line 415 of file Execute.h.

Member Typedef Documentation

using Executor::CachedCardinality = std::pair<bool, size_t>

Definition at line 1403 of file Execute.h.
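
The pair packs a hit flag together with the cached value, so callers must check the flag before trusting the count. A minimal standalone sketch of that lookup pattern, using a plain string key and a local map as stand-ins for the real CardinalityCacheKey and cardinality cache:

#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

// Simplified stand-ins for Executor::CachedCardinality and the cardinality cache.
using CachedCardinality = std::pair<bool, size_t>;  // {hit, cached cardinality}
static std::unordered_map<std::string, size_t> cardinality_cache;

// Mirrors the getCachedCardinality() contract: {false, 0} on a miss, {true, value} on a hit.
CachedCardinality get_cached_cardinality(const std::string& cache_key) {
  auto it = cardinality_cache.find(cache_key);
  if (it == cardinality_cache.end()) {
    return {false, 0};
  }
  return {true, it->second};
}

int main() {
  cardinality_cache["SELECT COUNT(*) FROM t"] = 42;
  auto cached = get_cached_cardinality("SELECT COUNT(*) FROM t");
  if (cached.first) {  // only read .second when the hit flag is set
    std::cout << "cached cardinality: " << cached.second << '\n';
  }
}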

using Executor::ExecutorId = size_t

Definition at line 422 of file Execute.h.

using Executor::PerFragmentCallBack = std::function<void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)>

Definition at line 890 of file Execute.h.

Member Enumeration Documentation

Enumerator
template_module 
udf_cpu_module 
udf_gpu_module 
rt_udf_cpu_module 
rt_udf_gpu_module 
rt_geos_module 
rt_libdevice_module 

Definition at line 518 of file Execute.h.

518  {
519  template_module, // RuntimeFunctions.bc
520  udf_cpu_module, // Load-time UDFs for CPU execution
521  udf_gpu_module, // Load-time UDFs for GPU execution
522  rt_udf_cpu_module, // Run-time UDF/UDTFs for CPU execution
523  rt_udf_gpu_module, // Run-time UDF/UDTFs for GPU execution
524  rt_geos_module, // geos functions
525  rt_libdevice_module // math library functions for GPU execution
526  };
std::unique_ptr< llvm::Module > udf_gpu_module
std::unique_ptr< llvm::Module > udf_cpu_module
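
For illustration, the enum typically serves as the key into extension_module_sources, mapping each module kind to the bitcode file it is loaded from. A standalone sketch of that lookup, where the map is a local stand-in and the file paths are made-up examples:

#include <iostream>
#include <map>
#include <string>

enum class ExtModuleKinds {
  template_module,
  udf_cpu_module,
  udf_gpu_module,
  rt_udf_cpu_module,
  rt_udf_gpu_module,
  rt_geos_module,
  rt_libdevice_module
};

int main() {
  // Local stand-in for Executor::extension_module_sources; paths are hypothetical.
  std::map<ExtModuleKinds, std::string> extension_module_sources{
      {ExtModuleKinds::template_module, "/opt/heavyai/QueryEngine/RuntimeFunctions.bc"},
      {ExtModuleKinds::rt_libdevice_module, "/usr/local/cuda/nvvm/libdevice/libdevice.10.bc"}};

  const auto kind = ExtModuleKinds::template_module;
  if (auto it = extension_module_sources.find(kind); it != extension_module_sources.end()) {
    std::cout << "module source: " << it->second << '\n';  // the module itself is loaded lazily elsewhere
  } else {
    std::cout << "no source registered for this module kind\n";
  }
}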

Constructor & Destructor Documentation

Executor::Executor ( const ExecutorId  id,
Data_Namespace::DataMgr *  data_mgr,
const size_t  block_size_x,
const size_t  grid_size_x,
const size_t  max_gpu_slab_size,
const std::string &  debug_dir,
const std::string &  debug_file 
)

Definition at line 272 of file Execute.cpp.

279  : executor_id_(executor_id)
280  , context_(new llvm::LLVMContext())
281  , cgen_state_(new CgenState({}, false, this))
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
const ExecutorId executor_id_
Definition: Execute.h:1476
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477

Member Function Documentation

ExecutorMutexHolder Executor::acquireExecuteMutex ( )
inlineprivate

Definition at line 1591 of file Execute.h.

References execute_mutex_, executor_id_, Executor::ExecutorMutexHolder::shared_lock, Executor::ExecutorMutexHolder::unique_lock, and UNITARY_EXECUTOR_ID.

1591  {
1592  ExecutorMutexHolder ret;
1593  if (executor_id_ == Executor::UNITARY_EXECUTOR_ID) {
1594  // Only one unitary executor can run at a time
1595  ret.unique_lock = heavyai::unique_lock<heavyai::shared_mutex>(execute_mutex_);
1596  } else {
1597  ret.shared_lock = heavyai::shared_lock<heavyai::shared_mutex>(execute_mutex_);
1598  }
1599  return ret;
1600  }
static heavyai::shared_mutex execute_mutex_
Definition: Execute.h:1585
std::shared_lock< T > shared_lock
const ExecutorId executor_id_
Definition: Execute.h:1476
std::unique_lock< T > unique_lock
static constexpr ExecutorId UNITARY_EXECUTOR_ID
Definition: Execute.h:423
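
A standalone sketch of the locking policy above, using std::shared_mutex and std::size_t in place of heavyai::shared_mutex and ExecutorId: the unitary executor (id 0) takes the mutex exclusively, while any other executor takes it shared so several of them can run side by side.

#include <cstddef>
#include <mutex>
#include <shared_mutex>

constexpr std::size_t UNITARY_EXECUTOR_ID = 0;
static std::shared_mutex execute_mutex;

struct ExecutorMutexHolder {
  std::shared_lock<std::shared_mutex> shared_lock;
  std::unique_lock<std::shared_mutex> unique_lock;
};

ExecutorMutexHolder acquire_execute_mutex(std::size_t executor_id) {
  ExecutorMutexHolder ret;
  if (executor_id == UNITARY_EXECUTOR_ID) {
    // Only one unitary executor can run at a time: exclusive ownership.
    ret.unique_lock = std::unique_lock<std::shared_mutex>(execute_mutex);
  } else {
    // Non-unitary executors may execute concurrently: shared ownership.
    ret.shared_lock = std::shared_lock<std::shared_mutex>(execute_mutex);
  }
  return ret;  // whichever lock was taken is released when the holder goes out of scope
}

int main() {
  auto holder = acquire_execute_mutex(3);  // id != 0, so a shared lock is held here
  return 0;
}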
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > Executor::addDeletedColumn ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co 
)
private

Definition at line 4441 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::add_deleted_col_to_map(), CHECK, CompilationOptions::filter_on_deleted_column, Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), and TABLE.

Referenced by executeWorkUnitImpl(), and executeWorkUnitPerFragment().

4443  {
4444  if (!co.filter_on_deleted_column) {
4445  return std::make_tuple(ra_exe_unit, PlanState::DeletedColumnsMap{});
4446  }
4447  auto ra_exe_unit_with_deleted = ra_exe_unit;
4448  PlanState::DeletedColumnsMap deleted_cols_map;
4449  for (const auto& input_table : ra_exe_unit_with_deleted.input_descs) {
4450  if (input_table.getSourceType() != InputSourceType::TABLE) {
4451  continue;
4452  }
4453  const auto& table_key = input_table.getTableKey();
4454  const auto catalog =
4455  Catalog_Namespace::SysCatalog::instance().getCatalog(table_key.db_id);
4456  CHECK(catalog);
4457  const auto td = catalog->getMetadataForTable(table_key.table_id);
4458  CHECK(td);
4459  const auto deleted_cd = catalog->getDeletedColumnIfRowsDeleted(td);
4460  if (!deleted_cd) {
4461  continue;
4462  }
4463  CHECK(deleted_cd->columnType.is_boolean());
4464  // check deleted column is not already present
4465  bool found = false;
4466  for (const auto& input_col : ra_exe_unit_with_deleted.input_col_descs) {
4467  if (input_col.get()->getColId() == deleted_cd->columnId &&
4468  input_col.get()->getScanDesc().getTableKey() == table_key &&
4469  input_col.get()->getScanDesc().getNestLevel() == input_table.getNestLevel()) {
4470  found = true;
4471  add_deleted_col_to_map(deleted_cols_map, deleted_cd, table_key);
4472  break;
4473  }
4474  }
4475  if (!found) {
4476  // add deleted column
4477  ra_exe_unit_with_deleted.input_col_descs.emplace_back(
4478  new InputColDescriptor(deleted_cd->columnId,
4479  deleted_cd->tableId,
4480  table_key.db_id,
4481  input_table.getNestLevel()));
4482  add_deleted_col_to_map(deleted_cols_map, deleted_cd, table_key);
4483  }
4484  }
4485  return std::make_tuple(ra_exe_unit_with_deleted, deleted_cols_map);
4486 }
std::unordered_map< shared::TableKey, const ColumnDescriptor * > DeletedColumnsMap
Definition: PlanState.h:44
static SysCatalog & instance()
Definition: SysCatalog.h:343
void add_deleted_col_to_map(PlanState::DeletedColumnsMap &deleted_cols_map, const ColumnDescriptor *deleted_cd, const shared::TableKey &table_key)
Definition: Execute.cpp:4429
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:291

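A simplified standalone sketch of the pass above: for every input table that has a delete column, make sure that column appears in the unit's input column descriptors and record it in the deleted-columns map. The types here are toy stand-ins for the real descriptor classes, not the actual Analyzer structures.

#include <iostream>
#include <map>
#include <vector>

// Toy stand-ins for the real descriptor types.
struct ColumnDesc { int table_id; int column_id; };
struct ExecutionUnit { std::vector<ColumnDesc> input_col_descs; };

// table id -> delete column id, for tables that actually have deleted rows.
using DeletedColumnsMap = std::map<int, int>;

DeletedColumnsMap add_deleted_columns(ExecutionUnit& unit,
                                      const std::map<int, int>& delete_col_by_table) {
  DeletedColumnsMap deleted_cols_map;
  for (const auto& [table_id, deleted_col_id] : delete_col_by_table) {
    bool found = false;
    for (const auto& cd : unit.input_col_descs) {
      if (cd.table_id == table_id && cd.column_id == deleted_col_id) {
        found = true;  // the hidden column is already being fetched
        break;
      }
    }
    if (!found) {
      unit.input_col_descs.push_back({table_id, deleted_col_id});  // add the hidden column
    }
    deleted_cols_map[table_id] = deleted_col_id;
  }
  return deleted_cols_map;
}

int main() {
  ExecutionUnit unit{{{1, 3}}};
  auto deleted = add_deleted_columns(unit, {{1, 99}});
  std::cout << unit.input_col_descs.size() << " input columns, "
            << deleted.size() << " deleted-column entries\n";
}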

llvm::Value * Executor::addJoinLoopIterator ( const std::vector< llvm::Value * > &  prev_iters,
const size_t  level_idx 
)
private

Definition at line 1185 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, and CHECK.

1186  {
1188  // Iterators are added for loop-outer joins when the head of the loop is generated,
1189  // then once again when the body if generated. Allow this instead of special handling
1190  // of call sites.
1191  const auto it = cgen_state_->scan_idx_to_hash_pos_.find(level_idx);
1192  if (it != cgen_state_->scan_idx_to_hash_pos_.end()) {
1193  return it->second;
1194  }
1195  CHECK(!prev_iters.empty());
1196  llvm::Value* matching_row_index = prev_iters.back();
1197  const auto it_ok =
1198  cgen_state_->scan_idx_to_hash_pos_.emplace(level_idx, matching_row_index);
1199  CHECK(it_ok.second);
1200  return matching_row_index;
1201 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:291
void Executor::addToCardinalityCache ( const CardinalityCacheKey &  cache_key,
const size_t  cache_value 
)

Definition at line 5255 of file Execute.cpp.

References cardinality_cache_, g_use_estimator_result_cache, recycler_mutex_, and VLOG.

5256  {
5257  if (g_use_estimator_result_cache) {
5258  heavyai::unique_lock<heavyai::shared_mutex> lock(recycler_mutex_);
5259  cardinality_cache_[cache_key] = cache_value;
5260  VLOG(1) << "Put estimated cardinality to the cache";
5261  }
5262 }
std::unique_lock< T > unique_lock
static std::unordered_map< CardinalityCacheKey, size_t > cardinality_cache_
Definition: Execute.h:1607
static heavyai::shared_mutex recycler_mutex_
Definition: Execute.h:1605
bool g_use_estimator_result_cache
Definition: Execute.cpp:135
#define VLOG(n)
Definition: Logger.h:388
bool Executor::addToQuerySessionList ( const QuerySessionId &  query_session,
const std::string &  query_str,
const std::string &  submitted,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_status,
heavyai::unique_lock< heavyai::shared_mutex > &  write_lock 
)

Definition at line 5086 of file Execute.cpp.

References queries_interrupt_flag_, and queries_session_map_.

Referenced by enrollQuerySession().

5092  {
5093  // an internal API that enrolls the query session into the Executor's session map
5094  if (queries_session_map_.count(query_session)) {
5095  if (queries_session_map_.at(query_session).count(submitted_time_str)) {
5096  queries_session_map_.at(query_session).erase(submitted_time_str);
5097  queries_session_map_.at(query_session)
5098  .emplace(submitted_time_str,
5099  QuerySessionStatus(query_session,
5100  executor_id,
5101  query_str,
5102  submitted_time_str,
5103  query_status));
5104  } else {
5105  queries_session_map_.at(query_session)
5106  .emplace(submitted_time_str,
5107  QuerySessionStatus(query_session,
5108  executor_id,
5109  query_str,
5110  submitted_time_str,
5111  query_status));
5112  }
5113  } else {
5114  std::map<std::string, QuerySessionStatus> executor_per_query_map;
5115  executor_per_query_map.emplace(
5116  submitted_time_str,
5117  QuerySessionStatus(
5118  query_session, executor_id, query_str, submitted_time_str, query_status));
5119  queries_session_map_.emplace(query_session, executor_per_query_map);
5120  }
5121  return queries_interrupt_flag_.emplace(query_session, false).second;
5122 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1578

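The nested-map bookkeeping above is easier to follow in isolation. A standalone sketch under simplified types (strings for the session id and submission time, an int for the status), mirroring the replace-then-enroll behavior and the boolean return value:

#include <iostream>
#include <map>
#include <string>

// session id -> (submitted time -> status); simplified stand-in for QuerySessionMap.
using QuerySessionMap = std::map<std::string, std::map<std::string, int>>;
// session id -> interrupt flag; simplified stand-in for InterruptFlagMap.
using InterruptFlagMap = std::map<std::string, bool>;

bool add_to_query_session_list(QuerySessionMap& sessions,
                               InterruptFlagMap& interrupt_flags,
                               const std::string& session_id,
                               const std::string& submitted_time,
                               int status) {
  // Replace any stale entry for the same (session, submission time) pair.
  sessions[session_id].erase(submitted_time);
  sessions[session_id].emplace(submitted_time, status);
  // Report whether a fresh interrupt flag was created for this session.
  return interrupt_flags.emplace(session_id, false).second;
}

int main() {
  QuerySessionMap sessions;
  InterruptFlagMap flags;
  bool fresh = add_to_query_session_list(sessions, flags, "session-1", "2024-01-01 00:00:00", 0);
  std::cout << std::boolalpha << fresh << '\n';  // true: first enrollment of this session
}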

void Executor::addTransientStringLiterals ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::shared_ptr< RowSetMemoryOwner > &  row_set_mem_owner 
)

Definition at line 2494 of file Execute.cpp.

References CHECK, getStringDictionaryProxy(), RelAlgExecutionUnit::groupby_exprs, kENCODING_DICT, kMODE, kSAMPLE, kSINGLE_VALUE, RelAlgExecutionUnit::quals, RelAlgExecutionUnit::simple_quals, RelAlgExecutionUnit::target_exprs, RelAlgExecutionUnit::target_exprs_union, and ScalarExprVisitor< T >::visit().

2496  {
2497  TransientDictIdVisitor dict_id_visitor;
2498 
2499  auto visit_expr =
2500  [this, &dict_id_visitor, &row_set_mem_owner](const Analyzer::Expr* expr) {
2501  if (!expr) {
2502  return;
2503  }
2504  const auto& dict_key = dict_id_visitor.visit(expr);
2505  if (dict_key.dict_id >= 0) {
2506  auto sdp = getStringDictionaryProxy(dict_key, row_set_mem_owner, true);
2507  CHECK(sdp);
2508  TransientStringLiteralsVisitor visitor(sdp, this);
2509  visitor.visit(expr);
2510  }
2511  };
2512 
2513  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2514  visit_expr(group_expr.get());
2515  }
2516 
2517  for (const auto& group_expr : ra_exe_unit.quals) {
2518  visit_expr(group_expr.get());
2519  }
2520 
2521  for (const auto& group_expr : ra_exe_unit.simple_quals) {
2522  visit_expr(group_expr.get());
2523  }
2524 
2525  const auto visit_target_expr = [&](const Analyzer::Expr* target_expr) {
2526  const auto& target_type = target_expr->get_type_info();
2527  if (!target_type.is_string() || target_type.get_compression() == kENCODING_DICT) {
2528  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
2529  if (agg_expr) {
2530  // The following agg types require taking into account transient string values
2531  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kSINGLE_VALUE ||
2532  agg_expr->get_aggtype() == kSAMPLE || agg_expr->get_aggtype() == kMODE) {
2533  visit_expr(agg_expr->get_arg());
2534  }
2535  } else {
2536  visit_expr(target_expr);
2537  }
2538  }
2539  };
2540  const auto& target_exprs = ra_exe_unit.target_exprs;
2541  std::for_each(target_exprs.begin(), target_exprs.end(), visit_target_expr);
2542  const auto& target_exprs_union = ra_exe_unit.target_exprs_union;
2543  std::for_each(target_exprs_union.begin(), target_exprs_union.end(), visit_target_expr);
2544 }
std::vector< Analyzer::Expr * > target_exprs
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
T visit(const Analyzer::Expr *expr) const
StringDictionaryProxy * getStringDictionaryProxy(const shared::StringDictKey &dict_key, const bool with_generation) const
Definition: Execute.h:578
std::vector< Analyzer::Expr * > target_exprs_union
std::list< std::shared_ptr< Analyzer::Expr > > quals
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqldefs.h:83
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals


void Executor::addUdfIrToModule ( const std::string &  udf_ir_filename,
const bool  is_cuda_ir 
)
static

Definition at line 1960 of file NativeCodegen.cpp.

Referenced by DBHandler::initialize().

1961  {
1962  extension_module_sources[is_cuda_ir
1963  ? Executor::ExtModuleKinds::udf_gpu_module
1964  : Executor::ExtModuleKinds::udf_cpu_module] =
1965  udf_ir_filename;
1966 }
static std::map< ExtModuleKinds, std::string > extension_module_sources
Definition: Execute.h:528


llvm::Value * Executor::aggregateWindowStatePtr ( CodeGenerator *  code_generator,
const CompilationOptions &  co 
)
private

Definition at line 232 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, CodegenUtil::createPtrWithHoistedMemoryAddr(), anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kFLOAT, and WindowFunctionContext::NUM_EXECUTION_DEVICES.

233  {
235  const auto window_func_context =
237  const auto window_func = window_func_context->getWindowFunction();
238  const auto arg_ti = get_adjusted_window_type_info(window_func);
239  llvm::Type* aggregate_state_type =
240  arg_ti.get_type() == kFLOAT
241  ? llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0)
242  : llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
243  const auto aggregate_state_i64 = cgen_state_->llInt(
244  reinterpret_cast<const int64_t>(window_func_context->aggregateState()));
246  cgen_state_.get(),
247  code_generator,
248  co,
249  aggregate_state_i64,
250  aggregate_state_type,
252  .front();
253 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static const int NUM_EXECUTION_DEVICES
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)


static size_t Executor::align ( const size_t  off_in,
const size_t  alignment 
)
inlinestaticprivate

Definition at line 1468 of file Execute.h.

Referenced by serializeLiterals().

1468  {
1469  size_t off = off_in;
1470  if (off % alignment != 0) {
1471  off += (alignment - off % alignment);
1472  }
1473  return off;
1474  }

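A quick check of the rounding-up behavior, using a standalone copy of the helper shown above:

#include <cassert>
#include <cstddef>

// Standalone copy of Executor::align(): round off_in up to the next multiple of alignment.
size_t align(const size_t off_in, const size_t alignment) {
  size_t off = off_in;
  if (off % alignment != 0) {
    off += (alignment - off % alignment);
  }
  return off;
}

int main() {
  assert(align(13, 8) == 16);  // not 8-aligned, round up
  assert(align(16, 8) == 16);  // already aligned, unchanged
  assert(align(0, 8) == 0);
  return 0;
}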

CurrentQueryStatus Executor::attachExecutorToQuerySession ( const QuerySessionId &  query_session_id,
const std::string &  query_str,
const std::string &  query_submitted_time 
)

Definition at line 4984 of file Execute.cpp.

References executor_id_, executor_session_mutex_, updateQuerySessionExecutorAssignment(), and updateQuerySessionStatusWithLock().

4987  {
4988  if (!query_session_id.empty()) {
4989  // if session is valid, do update 1) the exact executor id and 2) query status
4990  heavyai::unique_lock<heavyai::shared_mutex> write_lock(executor_session_mutex_);
4991  updateQuerySessionExecutorAssignment(
4992  query_session_id, query_submitted_time, executor_id_, write_lock);
4993  updateQuerySessionStatusWithLock(query_session_id,
4994  query_submitted_time,
4995  QuerySessionStatus::QueryStatus::PENDING_EXECUTOR,
4996  write_lock);
4997  }
4998  return {query_session_id, query_str};
4999 }
bool updateQuerySessionStatusWithLock(const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus updated_query_status, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
Definition: Execute.cpp:5124
const ExecutorId executor_id_
Definition: Execute.h:1476
bool updateQuerySessionExecutorAssignment(const QuerySessionId &query_session, const std::string &submitted_time_str, const size_t executor_id, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
Definition: Execute.cpp:5150
std::unique_lock< T > unique_lock
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574


void Executor::AutoTrackBuffersInRuntimeIR ( )
private

Definition at line 2304 of file NativeCodegen.cpp.

2304  {
2305  llvm::Module* M = cgen_state_->module_;
2306  if (M->getFunction("allocate_varlen_buffer") == nullptr) {
2307  return;
2308  }
2309 
2310  // read metadata
2311  bool should_track = false;
2312  auto* flag = M->getModuleFlag("manage_memory_buffer");
2313  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
2314  if (cnt->getZExtValue() == 1) {
2315  should_track = true;
2316  }
2317  }
2318 
2319  if (!should_track) {
2320  // metadata is not present
2321  return;
2322  }
2323 
2324  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
2325  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;
2326 
2327  for (llvm::Function& F : *M) {
2328  for (llvm::BasicBlock& BB : F) {
2329  for (llvm::Instruction& I : BB) {
2330  if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
2331  // Keep track of calls to "allocate_varlen_buffer" for later processing
2332  auto const called_func_name = CodegenUtil::getCalledFunctionName(*CI);
2333  if (called_func_name && *called_func_name == "allocate_varlen_buffer") {
2334  calls_to_analyze.push_back(CI);
2335  }
2336  }
2337  }
2338  }
2339  }
2340 
2341  // for each call to "allocate_varlen_buffer", check if there's a corresponding
2342  // call to "register_buffer_with_executor_rsm". If not, add a call to it
2343  llvm::IRBuilder<> Builder(cgen_state_->context_);
2344  auto i64 = get_int_type(64, cgen_state_->context_);
2345  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
2346  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
2347  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
2348  llvm::FunctionCallee register_buffer_fn =
2349  M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});
2350 
2351  int64_t executor_addr = reinterpret_cast<int64_t>(this);
2352  for (llvm::CallInst* CI : calls_to_analyze) {
2353  bool found = false;
2354  // for each user of the function, check if its a callinst
2355  // and if the callinst is calling "register_buffer_with_executor_rsm"
2356  // if no such instruction exist, add one registering the buffer
2357  for (llvm::User* U : CI->users()) {
2358  if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
2359  auto const func_name = CodegenUtil::getCalledFunctionName(*call);
2360  if (func_name && *func_name == "register_buffer_with_executor_rsm") {
2361  found = true;
2362  break;
2363  }
2364  }
2365  }
2366  if (!found) {
2367  Builder.SetInsertPoint(CI->getNextNode());
2368  Builder.CreateCall(register_buffer_fn,
2369  {ll_int(executor_addr, cgen_state_->context_), CI});
2370  }
2371  }
2372 }
std::optional< std::string_view > getCalledFunctionName(llvm::CallInst &call_inst)
#define LOG(tag)
Definition: Logger.h:285
llvm::ConstantInt * ll_int(const T v, llvm::LLVMContext &context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
llvm::Type * get_int_ptr_type(const int width, llvm::LLVMContext &context)
unsigned Executor::blockSize ( ) const

Definition at line 4332 of file Execute.cpp.

References block_size_x_, CHECK, data_mgr_, CudaMgr_Namespace::CudaMgr::getAllDeviceProperties(), and Data_Namespace::DataMgr::getCudaMgr().

Referenced by collectAllDeviceShardedTopResults(), executePlanWithGroupBy(), executePlanWithoutGroupBy(), executeTableFunction(), executeWorkUnitImpl(), reduceMultiDeviceResults(), reduceMultiDeviceResultSets(), and resultsUnion().

4332  {
4333  CHECK(data_mgr_);
4334  const auto cuda_mgr = data_mgr_->getCudaMgr();
4335  if (!cuda_mgr) {
4336  return 0;
4337  }
4338  const auto& dev_props = cuda_mgr->getAllDeviceProperties();
4339  return block_size_x_ ? block_size_x_ : dev_props.front().maxThreadsPerBlock;
4340 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:235
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
unsigned block_size_x_
Definition: Execute.h:1552
#define CHECK(condition)
Definition: Logger.h:291
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:134

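The zero-means-auto convention above (block_size_x_ of 0 falls back to the device's maxThreadsPerBlock, and gridSize() follows the same pattern) reduces to a one-line fallback. A standalone sketch with a hypothetical device limit:

#include <iostream>

// configured == 0 means "let the device default decide", mirroring block_size_x_ above.
unsigned effective_block_size(unsigned configured, unsigned device_max_threads_per_block) {
  return configured ? configured : device_max_threads_per_block;
}

int main() {
  std::cout << effective_block_size(0, 1024) << '\n';    // 1024: fall back to the device limit
  std::cout << effective_block_size(256, 1024) << '\n';  // 256: an explicit setting wins
}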

std::shared_ptr< HashJoin > Executor::buildCurrentLevelHashTable ( const JoinCondition &  current_level_join_conditions,
size_t  level_idx,
RelAlgExecutionUnit &  ra_exe_unit,
const CompilationOptions &  co,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap &  column_cache,
std::vector< std::string > &  fail_reasons 
)
private

Definition at line 1026 of file IRCodegen.cpp.

References anonymous_namespace{IRCodegen.cpp}::add_qualifier_to_execution_unit(), AUTOMATIC_IR_METADATA, anonymous_namespace{IRCodegen.cpp}::check_valid_join_qual(), Data_Namespace::CPU_LEVEL, CompilationOptions::device_type, Executor::JoinHashTableOrError::fail_reason, GPU, Data_Namespace::GPU_LEVEL, Executor::JoinHashTableOrError::hash_table, RelAlgExecutionUnit::hash_table_build_plan_dag, IS_EQUIVALENCE, LEFT, OneToOne, JoinCondition::quals, RelAlgExecutionUnit::query_hint, RelAlgExecutionUnit::table_id_to_node_map, JoinCondition::type, and VLOG.

1033  {
1035  std::shared_ptr<HashJoin> current_level_hash_table;
1036  auto handleNonHashtableQual = [&ra_exe_unit, &level_idx, this](
1037  JoinType join_type,
1038  std::shared_ptr<Analyzer::Expr> qual) {
1039  if (join_type == JoinType::LEFT) {
1040  plan_state_->addNonHashtableQualForLeftJoin(level_idx, qual);
1041  } else {
1042  add_qualifier_to_execution_unit(ra_exe_unit, qual);
1043  }
1044  };
1045  for (const auto& join_qual : current_level_join_conditions.quals) {
1046  auto qual_bin_oper = std::dynamic_pointer_cast<Analyzer::BinOper>(join_qual);
1047  if (current_level_hash_table || !qual_bin_oper ||
1048  !IS_EQUIVALENCE(qual_bin_oper->get_optype())) {
1049  handleNonHashtableQual(current_level_join_conditions.type, join_qual);
1050  if (!current_level_hash_table) {
1051  fail_reasons.emplace_back("No equijoin expression found");
1052  }
1053  continue;
1054  }
1055  check_valid_join_qual(qual_bin_oper);
1056  JoinHashTableOrError hash_table_or_error;
1057  if (!current_level_hash_table) {
1058  hash_table_or_error = buildHashTableForQualifier(
1059  qual_bin_oper,
1060  query_infos,
1061  co.device_type == ExecutorDeviceType::GPU ? Data_Namespace::GPU_LEVEL
1062  : Data_Namespace::CPU_LEVEL,
1063  current_level_join_conditions.type,
1064  HashType::OneToOne,
1065  column_cache,
1066  ra_exe_unit.hash_table_build_plan_dag,
1067  ra_exe_unit.query_hint,
1068  ra_exe_unit.table_id_to_node_map);
1069  current_level_hash_table = hash_table_or_error.hash_table;
1070  }
1071  if (hash_table_or_error.hash_table) {
1072  plan_state_->join_info_.join_hash_tables_.push_back(hash_table_or_error.hash_table);
1073  plan_state_->join_info_.equi_join_tautologies_.push_back(qual_bin_oper);
1074  } else {
1075  fail_reasons.push_back(hash_table_or_error.fail_reason);
1076  if (!current_level_hash_table) {
1077  VLOG(2) << "Building a hashtable based on a qual " << qual_bin_oper->toString()
1078  << " fails: " << hash_table_or_error.fail_reason;
1079  }
1080  handleNonHashtableQual(current_level_join_conditions.type, qual_bin_oper);
1081  }
1082  }
1083  return current_level_hash_table;
1084 }
JoinType
Definition: sqldefs.h:174
#define IS_EQUIVALENCE(X)
Definition: sqldefs.h:69
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
TableIdToNodeMap table_id_to_node_map
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
void add_qualifier_to_execution_unit(RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< Analyzer::Expr > &qual)
Definition: IRCodegen.cpp:535
#define AUTOMATIC_IR_METADATA(CGENSTATE)
ExecutorDeviceType device_type
JoinHashTableOrError buildHashTableForQualifier(const std::shared_ptr< Analyzer::BinOper > &qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, ColumnCacheMap &column_cache, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
Definition: Execute.cpp:4275
std::list< std::shared_ptr< Analyzer::Expr > > quals
RegisteredQueryHint query_hint
#define VLOG(n)
Definition: Logger.h:388
HashTableBuildDagMap hash_table_build_plan_dag
void check_valid_join_qual(std::shared_ptr< Analyzer::BinOper > &bin_oper)
Definition: IRCodegen.cpp:586

+ Here is the call graph for this function:
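A sketch of the per-level qualifier classification performed above, in plain C++ with illustrative types: only the first equi-join qualifier at a level is used to build a hash table, and every other qualifier either stays attached to the level as a LEFT-join condition or is folded back into the execution unit's filters.

#include <memory>
#include <vector>

enum class JoinKind { INNER, LEFT };

struct Qual {
  bool is_equi_join;
};

struct LevelPlan {
  std::shared_ptr<int> hash_table;      // stand-in for the HashJoin object
  std::vector<Qual> left_join_quals;    // evaluated inside the loop body
  std::vector<Qual> pushed_down_quals;  // re-attached to the unit's filters
};

LevelPlan classify_quals(const std::vector<Qual>& quals, JoinKind kind) {
  LevelPlan plan;
  for (const auto& q : quals) {
    if (!plan.hash_table && q.is_equi_join) {
      plan.hash_table = std::make_shared<int>(0);  // "build" the hash table once
      continue;
    }
    if (kind == JoinKind::LEFT) {
      plan.left_join_quals.push_back(q);    // must stay with this join level
    } else {
      plan.pushed_down_quals.push_back(q);  // safe to evaluate as a filter
    }
  }
  return plan;
}
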

Executor::JoinHashTableOrError Executor::buildHashTableForQualifier ( const std::shared_ptr< Analyzer::BinOper > &  qual_bin_oper,
const std::vector< InputTableInfo > &  query_infos,
const MemoryLevel  memory_level,
const JoinType  join_type,
const HashType  preferred_hash_type,
ColumnCacheMap column_cache,
const HashTableBuildDagMap hashtable_build_dag_map,
const RegisteredQueryHint query_hint,
const TableIdToNodeMap table_id_to_node_map 
)
private

Definition at line 4275 of file Execute.cpp.

References deviceCountForMemoryLevel(), ERR_INTERRUPTED, g_enable_bbox_intersect_hashjoin, g_enable_dynamic_watchdog, HashJoin::getInstance(), and interrupted_.

4284  {
4285  if (!g_enable_bbox_intersect_hashjoin && qual_bin_oper->is_bbox_intersect_oper()) {
4286  return {nullptr,
4287  "Bounding box intersection disabled, attempting to fall back to loop join"};
4288  }
4289  if (g_enable_dynamic_watchdog && interrupted_.load()) {
4290  throw QueryExecutionError(ERR_INTERRUPTED);
4291  }
4292  try {
4293  auto tbl = HashJoin::getInstance(qual_bin_oper,
4294  query_infos,
4295  memory_level,
4296  join_type,
4297  preferred_hash_type,
4298  deviceCountForMemoryLevel(memory_level),
4299  column_cache,
4300  this,
4301  hashtable_build_dag_map,
4302  query_hint,
4303  table_id_to_node_map);
4304  return {tbl, ""};
4305  } catch (const HashJoinFail& e) {
4306  return {nullptr, e.what()};
4307  }
4308 }
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
std::atomic< bool > interrupted_
Definition: Execute.h:1543
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:81
bool g_enable_bbox_intersect_hashjoin
Definition: Execute.cpp:105
int deviceCountForMemoryLevel(const Data_Namespace::MemoryLevel memory_level) const
Definition: Execute.cpp:1305
static std::shared_ptr< HashJoin > getInstance(const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
Make hash table from an in-flight SQL query&#39;s parse tree etc.
Definition: HashJoin.cpp:285

+ Here is the call graph for this function:
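A minimal sketch of the result-or-reason convention used above (illustrative names, not the HashJoin API): a failed build never propagates an exception past this boundary; it produces a null table plus a reason string that the caller can log before falling back to a loop join.

#include <memory>
#include <stdexcept>
#include <string>

struct HashTable {};
struct BuildError : std::runtime_error {
  using std::runtime_error::runtime_error;
};

struct TableOrError {
  std::shared_ptr<HashTable> table;
  std::string fail_reason;
};

TableOrError try_build(bool feature_enabled, bool will_fail) {
  if (!feature_enabled) {
    return {nullptr, "Feature disabled, attempting to fall back to loop join"};
  }
  try {
    if (will_fail) {
      throw BuildError("too many entries for this hash table layout");
    }
    return {std::make_shared<HashTable>(), ""};
  } catch (const BuildError& e) {
    // Convert the exception into a loggable failure reason.
    return {nullptr, e.what()};
  }
}
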

JoinLoop::HoistedFiltersCallback Executor::buildHoistLeftHandSideFiltersCb ( const RelAlgExecutionUnit ra_exe_unit,
const size_t  level_idx,
const shared::TableKey inner_table_key,
const CompilationOptions co 
)
private

Definition at line 858 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CodeGenerator::codegen(), g_enable_left_join_filter_hoisting, RelAlgExecutionUnit::join_quals, LEFT, RelAlgExecutionUnit::quals, RelAlgExecutionUnit::simple_quals, CodeGenerator::toBool(), and VLOG.

862  {
863  if (!g_enable_left_join_filter_hoisting) {
864  return nullptr;
865  }
866 
867  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
868  if (level_idx == 0 && current_level_join_conditions.type == JoinType::LEFT) {
869  const auto& condition = current_level_join_conditions.quals.front();
870  const auto bin_oper = dynamic_cast<const Analyzer::BinOper*>(condition.get());
871  CHECK(bin_oper) << condition->toString();
872  const auto rhs =
873  dynamic_cast<const Analyzer::ColumnVar*>(bin_oper->get_right_operand());
874  const auto lhs =
875  dynamic_cast<const Analyzer::ColumnVar*>(bin_oper->get_left_operand());
876  if (lhs && rhs && lhs->getTableKey() != rhs->getTableKey()) {
877  const Analyzer::ColumnVar* selected_lhs{nullptr};
878  // grab the left hand side column -- this is somewhat similar to normalize column
879  // pair, and a better solution may be to hoist that function out of the join
880  // framework and normalize columns at the top of build join loops
881  if (lhs->getTableKey() == inner_table_id) {
882  selected_lhs = rhs;
883  } else if (rhs->getTableKey() == inner_table_id) {
884  selected_lhs = lhs;
885  }
886  if (selected_lhs) {
887  std::list<std::shared_ptr<Analyzer::Expr>> hoisted_quals;
888  // get all LHS-only filters
889  auto should_hoist_qual = [&hoisted_quals](const auto& qual,
890  const shared::TableKey& table_key) {
891  CHECK(qual);
892 
893  ExprTableIdVisitor visitor;
894  const auto table_keys = visitor.visit(qual.get());
895  if (table_keys.size() == 1 && table_keys.find(table_key) != table_keys.end()) {
896  hoisted_quals.push_back(qual);
897  }
898  };
899  for (const auto& qual : ra_exe_unit.simple_quals) {
900  should_hoist_qual(qual, selected_lhs->getTableKey());
901  }
902  for (const auto& qual : ra_exe_unit.quals) {
903  should_hoist_qual(qual, selected_lhs->getTableKey());
904  }
905 
906  // build the filters callback and return it
907  if (!hoisted_quals.empty()) {
908  return [this, hoisted_quals, co](llvm::BasicBlock* true_bb,
909  llvm::BasicBlock* exit_bb,
910  const std::string& loop_name,
911  llvm::Function* parent_func,
912  CgenState* cgen_state) -> llvm::BasicBlock* {
913  // make sure we have quals to hoist
914  bool has_quals_to_hoist = false;
915  for (const auto& qual : hoisted_quals) {
916  // check to see if the filter was previously hoisted. if all filters were
917  // previously hoisted, this callback becomes a noop
918  if (plan_state_->hoisted_filters_.count(qual) == 0) {
919  has_quals_to_hoist = true;
920  break;
921  }
922  }
923 
924  if (!has_quals_to_hoist) {
925  return nullptr;
926  }
927 
928  AUTOMATIC_IR_METADATA(cgen_state);
929 
930  llvm::IRBuilder<>& builder = cgen_state->ir_builder_;
931  auto& context = builder.getContext();
932 
933  const auto filter_bb =
934  llvm::BasicBlock::Create(context,
935  "hoisted_left_join_filters_" + loop_name,
936  parent_func,
937  /*insert_before=*/true_bb);
938  builder.SetInsertPoint(filter_bb);
939 
940  llvm::Value* filter_lv = cgen_state_->llBool(true);
941  CodeGenerator code_generator(this);
943  for (const auto& qual : hoisted_quals) {
944  if (plan_state_->hoisted_filters_.insert(qual).second) {
945  // qual was inserted into the hoisted filters map, which means we have not
946  // seen this qual before. Generate filter.
947  VLOG(1) << "Generating code for hoisted left hand side qualifier "
948  << qual->toString();
949  auto cond = code_generator.toBool(
950  code_generator.codegen(qual.get(), true, co).front());
951  filter_lv = builder.CreateAnd(filter_lv, cond);
952  }
953  }
954  CHECK(filter_lv->getType()->isIntegerTy(1));
955 
956  builder.CreateCondBr(filter_lv, true_bb, exit_bb);
957  return filter_bb;
958  };
959  }
960  }
961  }
962  }
963  return nullptr;
964 }
bool g_enable_left_join_filter_hoisting
Definition: Execute.cpp:103
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define AUTOMATIC_IR_METADATA(CGENSTATE)
std::list< std::shared_ptr< Analyzer::Expr > > quals
#define CHECK(condition)
Definition: Logger.h:291
#define VLOG(n)
Definition: Logger.h:388
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals

+ Here is the call graph for this function:
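A sketch of the hoisting test applied above, with illustrative types: a filter qualifies for hoisting when every table it references is the single outer (left-hand side) table of the LEFT join, so it can be evaluated once before entering the join loop instead of once per inner row.

#include <set>
#include <vector>

using TableId = int;

struct Filter {
  std::set<TableId> referenced_tables;
};

std::vector<Filter> collect_hoistable(const std::vector<Filter>& filters,
                                      TableId lhs_table) {
  std::vector<Filter> hoisted;
  for (const auto& f : filters) {
    if (f.referenced_tables.size() == 1 && f.referenced_tables.count(lhs_table)) {
      hoisted.push_back(f);  // evaluate once, before the join loop body
    }
  }
  return hoisted;
}
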

std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> Executor::buildIsDeletedCb ( const RelAlgExecutionUnit ra_exe_unit,
const size_t  level_idx,
const CompilationOptions co 
)
private

Definition at line 967 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_LT, CodeGenerator::codegen(), CompilationOptions::filter_on_deleted_column, RelAlgExecutionUnit::input_descs, TABLE, and CodeGenerator::toBool().

969  {
971  if (!co.filter_on_deleted_column) {
972  return nullptr;
973  }
974  CHECK_LT(level_idx + 1, ra_exe_unit.input_descs.size());
975  const auto input_desc = ra_exe_unit.input_descs[level_idx + 1];
976  if (input_desc.getSourceType() != InputSourceType::TABLE) {
977  return nullptr;
978  }
979 
980  const auto deleted_cd = plan_state_->getDeletedColForTable(input_desc.getTableKey());
981  if (!deleted_cd) {
982  return nullptr;
983  }
984  CHECK(deleted_cd->columnType.is_boolean());
985  const auto deleted_expr = makeExpr<Analyzer::ColumnVar>(
986  deleted_cd->columnType,
987  shared::ColumnKey{input_desc.getTableKey(), deleted_cd->columnId},
988  input_desc.getNestLevel());
989  return [this, deleted_expr, level_idx, &co](const std::vector<llvm::Value*>& prev_iters,
990  llvm::Value* have_more_inner_rows) {
991  const auto matching_row_index = addJoinLoopIterator(prev_iters, level_idx + 1);
992  // Avoid fetching the deleted column from a position which is not valid.
993  // An invalid position can be returned by a one to one hash lookup (negative)
994  // or at the end of iteration over a set of matching values.
995  llvm::Value* is_valid_it{nullptr};
996  if (have_more_inner_rows) {
997  is_valid_it = have_more_inner_rows;
998  } else {
999  is_valid_it = cgen_state_->ir_builder_.CreateICmp(
1000  llvm::ICmpInst::ICMP_SGE, matching_row_index, cgen_state_->llInt<int64_t>(0));
1001  }
1002  const auto it_valid_bb = llvm::BasicBlock::Create(
1003  cgen_state_->context_, "it_valid", cgen_state_->current_func_);
1004  const auto it_not_valid_bb = llvm::BasicBlock::Create(
1005  cgen_state_->context_, "it_not_valid", cgen_state_->current_func_);
1006  cgen_state_->ir_builder_.CreateCondBr(is_valid_it, it_valid_bb, it_not_valid_bb);
1007  const auto row_is_deleted_bb = llvm::BasicBlock::Create(
1008  cgen_state_->context_, "row_is_deleted", cgen_state_->current_func_);
1009  cgen_state_->ir_builder_.SetInsertPoint(it_valid_bb);
1010  CodeGenerator code_generator(this);
1011  const auto row_is_deleted = code_generator.toBool(
1012  code_generator.codegen(deleted_expr.get(), true, co).front());
1013  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
1014  cgen_state_->ir_builder_.SetInsertPoint(it_not_valid_bb);
1015  const auto row_is_deleted_default = cgen_state_->llBool(false);
1016  cgen_state_->ir_builder_.CreateBr(row_is_deleted_bb);
1017  cgen_state_->ir_builder_.SetInsertPoint(row_is_deleted_bb);
1018  auto row_is_deleted_or_default =
1019  cgen_state_->ir_builder_.CreatePHI(row_is_deleted->getType(), 2);
1020  row_is_deleted_or_default->addIncoming(row_is_deleted, it_valid_bb);
1021  row_is_deleted_or_default->addIncoming(row_is_deleted_default, it_not_valid_bb);
1022  return row_is_deleted_or_default;
1023  };
1024 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:303
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:1185
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:
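The generated control flow can be read as the following plain C++ sketch (names are illustrative): the deleted flag is fetched only for a valid inner-row position, and an invalid position contributes false, which is what the PHI node in the listing merges.

#include <optional>

// fetch_deleted_flag stands in for the generated load of the deleted column.
bool row_is_deleted_or_default(long matching_row_index,
                               std::optional<bool> have_more_inner_rows,
                               bool (*fetch_deleted_flag)(long)) {
  // A one-to-one hash lookup signals "no match" with a negative index.
  const bool is_valid_it =
      have_more_inner_rows ? *have_more_inner_rows : matching_row_index >= 0;
  return is_valid_it ? fetch_deleted_flag(matching_row_index) : false;
}
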

std::vector< JoinLoop > Executor::buildJoinLoops ( RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co,
const ExecutionOptions eo,
const std::vector< InputTableInfo > &  query_infos,
ColumnCacheMap column_cache 
)
private

Definition at line 610 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_LT, CodeGenerator::codegen(), INJECT_TIMER, CgenState::ir_builder_, RelAlgExecutionUnit::join_quals, LEFT, PlanState::left_join_non_hashtable_quals_, CgenState::llBool(), MultiSet, OneToOne, CgenState::outer_join_match_found_per_level_, CodeGenerator::plan_state_, Set, Singleton, JoinLoopDomain::slot_lookup_result, CodeGenerator::toBool(), and JoinLoopDomain::values_buffer.

615  {
618  std::vector<JoinLoop> join_loops;
619  for (size_t level_idx = 0, current_hash_table_idx = 0;
620  level_idx < ra_exe_unit.join_quals.size();
621  ++level_idx) {
622  const auto& current_level_join_conditions = ra_exe_unit.join_quals[level_idx];
623  std::vector<std::string> fail_reasons;
624  const auto current_level_hash_table =
625  buildCurrentLevelHashTable(current_level_join_conditions,
626  level_idx,
627  ra_exe_unit,
628  co,
629  query_infos,
630  column_cache,
631  fail_reasons);
632  const auto found_outer_join_matches_cb =
633  [this, level_idx](llvm::Value* found_outer_join_matches) {
634  CHECK_LT(level_idx, cgen_state_->outer_join_match_found_per_level_.size());
635  CHECK(!cgen_state_->outer_join_match_found_per_level_[level_idx]);
636  cgen_state_->outer_join_match_found_per_level_[level_idx] =
637  found_outer_join_matches;
638  };
639  const auto is_deleted_cb = buildIsDeletedCb(ra_exe_unit, level_idx, co);
640  auto rem_left_join_quals_it =
641  plan_state_->left_join_non_hashtable_quals_.find(level_idx);
642  bool has_remaining_left_join_quals =
643  rem_left_join_quals_it != plan_state_->left_join_non_hashtable_quals_.end() &&
644  !rem_left_join_quals_it->second.empty();
645  const auto outer_join_condition_remaining_quals_cb =
646  [this, level_idx, &co](const std::vector<llvm::Value*>& prev_iters) {
647  // when we have multiple quals for the left join in the current join level
648  // we first try to build a hashtable by using one of the possible quals,
649  // and deal with remaining quals as extra join conditions
650  FetchCacheAnchor anchor(cgen_state_.get());
651  addJoinLoopIterator(prev_iters, level_idx + 1);
652  llvm::Value* left_join_cond = cgen_state_->llBool(true);
653  CodeGenerator code_generator(this);
654  auto it = plan_state_->left_join_non_hashtable_quals_.find(level_idx);
655  if (it != plan_state_->left_join_non_hashtable_quals_.end()) {
656  for (auto expr : it->second) {
657  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
658  left_join_cond,
659  code_generator.toBool(
660  code_generator.codegen(expr.get(), true, co).front()));
661  }
662  }
663  return left_join_cond;
664  };
665  if (current_level_hash_table) {
666  const auto hoisted_filters_cb = buildHoistLeftHandSideFiltersCb(
667  ra_exe_unit, level_idx, current_level_hash_table->getInnerTableId(), co);
668  if (current_level_hash_table->getHashType() == HashType::OneToOne) {
669  join_loops.emplace_back(
670  /*kind=*/JoinLoopKind::Singleton,
671  /*type=*/current_level_join_conditions.type,
672  /*iteration_domain_codegen=*/
673  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
674  const std::vector<llvm::Value*>& prev_iters) {
675  addJoinLoopIterator(prev_iters, level_idx);
676  JoinLoopDomain domain{{0}};
677  domain.slot_lookup_result =
678  current_level_hash_table->codegenSlot(co, current_hash_table_idx);
679  return domain;
680  },
681  /*outer_condition_match=*/
682  current_level_join_conditions.type == JoinType::LEFT &&
683  has_remaining_left_join_quals
684  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
685  outer_join_condition_remaining_quals_cb)
686  : nullptr,
687  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
688  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
689  : nullptr,
690  /*hoisted_filters=*/hoisted_filters_cb,
691  /*is_deleted=*/is_deleted_cb,
692  /*nested_loop_join=*/false);
693  } else if (auto range_join_table =
694  dynamic_cast<RangeJoinHashTable*>(current_level_hash_table.get())) {
695  join_loops.emplace_back(
696  /* kind= */ JoinLoopKind::MultiSet,
697  /* type= */ current_level_join_conditions.type,
698  /* iteration_domain_codegen= */
699  [this,
700  range_join_table,
701  current_hash_table_idx,
702  level_idx,
703  current_level_hash_table,
704  &co](const std::vector<llvm::Value*>& prev_iters) {
705  addJoinLoopIterator(prev_iters, level_idx);
706  JoinLoopDomain domain{{0}};
707  CHECK(!prev_iters.empty());
708  const auto matching_set = range_join_table->codegenMatchingSetWithOffset(
709  co, current_hash_table_idx, prev_iters.back());
710  domain.values_buffer = matching_set.elements;
711  domain.element_count = matching_set.count;
712  return domain;
713  },
714  /* outer_condition_match= */
715  current_level_join_conditions.type == JoinType::LEFT
716  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
717  outer_join_condition_remaining_quals_cb)
718  : nullptr,
719  /* found_outer_matches= */
720  current_level_join_conditions.type == JoinType::LEFT
721  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
722  : nullptr,
723  /* hoisted_filters= */ nullptr, // <<! TODO
724  /* is_deleted= */ is_deleted_cb,
725  /*nested_loop_join=*/false);
726  } else {
727  join_loops.emplace_back(
728  /*kind=*/JoinLoopKind::Set,
729  /*type=*/current_level_join_conditions.type,
730  /*iteration_domain_codegen=*/
731  [this, current_hash_table_idx, level_idx, current_level_hash_table, &co](
732  const std::vector<llvm::Value*>& prev_iters) {
733  addJoinLoopIterator(prev_iters, level_idx);
734  JoinLoopDomain domain{{0}};
735  const auto matching_set = current_level_hash_table->codegenMatchingSet(
736  co, current_hash_table_idx);
737  domain.values_buffer = matching_set.elements;
738  domain.element_count = matching_set.count;
739  return domain;
740  },
741  /*outer_condition_match=*/
742  current_level_join_conditions.type == JoinType::LEFT
743  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
744  outer_join_condition_remaining_quals_cb)
745  : nullptr,
746  /*found_outer_matches=*/current_level_join_conditions.type == JoinType::LEFT
747  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
748  : nullptr,
749  /*hoisted_filters=*/hoisted_filters_cb,
750  /*is_deleted=*/is_deleted_cb,
751  /*nested_loop_join=*/false);
752  }
753  ++current_hash_table_idx;
754  } else {
755  const auto fail_reasons_str = current_level_join_conditions.quals.empty()
756  ? "No equijoin expression found"
757  : boost::algorithm::join(fail_reasons, " | ");
758  check_if_loop_join_is_allowed(
759  ra_exe_unit, eo, query_infos, level_idx, fail_reasons_str);
760  // Callback provided to the `JoinLoop` framework to evaluate the (outer) join
761  // condition.
762  VLOG(1) << "Unable to build hash table, falling back to loop join: "
763  << fail_reasons_str;
764  const auto outer_join_condition_cb =
765  [this, level_idx, &co, &current_level_join_conditions](
766  const std::vector<llvm::Value*>& prev_iters) {
767  // The values generated for the match path don't dominate all uses
768  // since on the non-match path nulls are generated. Reset the cache
769  // once the condition is generated to avoid incorrect reuse.
770  FetchCacheAnchor anchor(cgen_state_.get());
771  addJoinLoopIterator(prev_iters, level_idx + 1);
772  llvm::Value* left_join_cond = cgen_state_->llBool(true);
773  CodeGenerator code_generator(this);
774  for (auto expr : current_level_join_conditions.quals) {
775  left_join_cond = cgen_state_->ir_builder_.CreateAnd(
776  left_join_cond,
777  code_generator.toBool(
778  code_generator.codegen(expr.get(), true, co).front()));
779  }
780  return left_join_cond;
781  };
782  join_loops.emplace_back(
783  /*kind=*/JoinLoopKind::UpperBound,
784  /*type=*/current_level_join_conditions.type,
785  /*iteration_domain_codegen=*/
786  [this, level_idx](const std::vector<llvm::Value*>& prev_iters) {
787  addJoinLoopIterator(prev_iters, level_idx);
788  JoinLoopDomain domain{{0}};
789  auto* arg = get_arg_by_name(cgen_state_->row_func_, "num_rows_per_scan");
790  const auto rows_per_scan_ptr = cgen_state_->ir_builder_.CreateGEP(
791  arg->getType()->getScalarType()->getPointerElementType(),
792  arg,
793  cgen_state_->llInt(int32_t(level_idx + 1)));
794  domain.upper_bound = cgen_state_->ir_builder_.CreateLoad(
795  rows_per_scan_ptr->getType()->getPointerElementType(),
796  rows_per_scan_ptr,
797  "num_rows_per_scan");
798  return domain;
799  },
800  /*outer_condition_match=*/
801  current_level_join_conditions.type == JoinType::LEFT
802  ? std::function<llvm::Value*(const std::vector<llvm::Value*>&)>(
803  outer_join_condition_cb)
804  : nullptr,
805  /*found_outer_matches=*/
806  current_level_join_conditions.type == JoinType::LEFT
807  ? std::function<void(llvm::Value*)>(found_outer_join_matches_cb)
808  : nullptr,
809  /*hoisted_filters=*/nullptr,
810  /*is_deleted=*/is_deleted_cb,
811  /*nested_loop_join=*/true);
812  }
813  }
814  return join_loops;
815 }
llvm::Value * values_buffer
Definition: JoinLoop.h:49
std::string join(T const &container, std::string const &delim)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:168
#define INJECT_TIMER(DESC)
Definition: measure.h:96
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * slot_lookup_result
Definition: JoinLoop.h:47
#define CHECK_LT(x, y)
Definition: Logger.h:303
std::shared_ptr< HashJoin > buildCurrentLevelHashTable(const JoinCondition &current_level_join_conditions, size_t level_idx, RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache, std::vector< std::string > &fail_reasons)
Definition: IRCodegen.cpp:1026
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:1185
#define CHECK(condition)
Definition: Logger.h:291
void check_if_loop_join_is_allowed(RelAlgExecutionUnit &ra_exe_unit, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, const size_t level_idx, const std::string &fail_reason)
Definition: IRCodegen.cpp:545
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:610
std::function< llvm::Value *(const std::vector< llvm::Value * > &, llvm::Value *)> buildIsDeletedCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const CompilationOptions &co)
Definition: IRCodegen.cpp:967
JoinLoop::HoistedFiltersCallback buildHoistLeftHandSideFiltersCb(const RelAlgExecutionUnit &ra_exe_unit, const size_t level_idx, const shared::TableKey &inner_table_key, const CompilationOptions &co)
Definition: IRCodegen.cpp:858
#define VLOG(n)
Definition: Logger.h:388

+ Here is the call graph for this function:
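The loop-kind selection in the listing can be summarized by the following sketch; the enum mirrors the JoinLoopKind values referenced above, while the boolean predicates are illustrative stand-ins for the real hash-table queries.

enum class LoopKind { Singleton, MultiSet, Set, UpperBound };

LoopKind pick_loop_kind(bool has_hash_table,
                        bool is_one_to_one,
                        bool is_range_join) {
  if (!has_hash_table) {
    return LoopKind::UpperBound;  // nested loop join over num_rows_per_scan
  }
  if (is_one_to_one) {
    return LoopKind::Singleton;   // single slot lookup per outer row
  }
  if (is_range_join) {
    return LoopKind::MultiSet;    // matching set computed with a per-row offset
  }
  return LoopKind::Set;           // generic one-to-many matching set
}
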

void Executor::buildSelectedFragsMapping ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
std::vector< size_t > &  local_col_to_frag_pos,
const std::list< std::shared_ptr< const InputColDescriptor >> &  col_global_ids,
const FragmentsList selected_fragments,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 3742 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LT, getFragmentCount(), RelAlgExecutionUnit::input_descs, and plan_state_.

Referenced by fetchChunks().

3747  {
3748  local_col_to_frag_pos.resize(plan_state_->global_to_local_col_ids_.size());
3749  size_t frag_pos{0};
3750  const auto& input_descs = ra_exe_unit.input_descs;
3751  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
3752  const auto& table_key = input_descs[scan_idx].getTableKey();
3753  CHECK_EQ(selected_fragments[scan_idx].table_key, table_key);
3754  selected_fragments_crossjoin.push_back(
3755  getFragmentCount(selected_fragments, scan_idx, ra_exe_unit));
3756  for (const auto& col_id : col_global_ids) {
3757  CHECK(col_id);
3758  const auto& input_desc = col_id->getScanDesc();
3759  if (input_desc.getTableKey() != table_key ||
3760  input_desc.getNestLevel() != static_cast<int>(scan_idx)) {
3761  continue;
3762  }
3763  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
3764  CHECK(it != plan_state_->global_to_local_col_ids_.end());
3765  CHECK_LT(static_cast<size_t>(it->second),
3766  plan_state_->global_to_local_col_ids_.size());
3767  local_col_to_frag_pos[it->second] = frag_pos;
3768  }
3769  ++frag_pos;
3770  }
3771 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::vector< InputDescriptor > input_descs
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define CHECK_LT(x, y)
Definition: Logger.h:303
std::vector< size_t > getFragmentCount(const FragmentsList &selected_fragments, const size_t scan_idx, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:3728
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Executor::buildSelectedFragsMappingForUnion ( std::vector< std::vector< size_t >> &  selected_fragments_crossjoin,
const FragmentsList selected_fragments,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 3773 of file Execute.cpp.

References RelAlgExecutionUnit::input_descs.

Referenced by fetchUnionChunks().

3776  {
3777  const auto& input_descs = ra_exe_unit.input_descs;
3778  for (size_t scan_idx = 0; scan_idx < input_descs.size(); ++scan_idx) {
3779  // selected_fragments is set in assignFragsToKernelDispatch execution_kernel.fragments
3780  if (selected_fragments[0].table_key == input_descs[scan_idx].getTableKey()) {
3781  selected_fragments_crossjoin.push_back({size_t(1)});
3782  }
3783  }
3784 }
std::vector< InputDescriptor > input_descs

+ Here is the caller graph for this function:

FragmentSkipStatus Executor::canSkipFragmentForFpQual ( const Analyzer::BinOper comp_expr,
const Analyzer::ColumnVar lhs_col,
const Fragmenter_Namespace::FragmentInfo fragment,
const Analyzer::Constant rhs_const 
) const
private

Definition at line 4564 of file Execute.cpp.

References CHECK, shared::ColumnKey::column_id, extract_max_stat_fp_type(), extract_min_stat_fp_type(), Analyzer::Constant::get_constval(), Analyzer::BinOper::get_optype(), SQLTypeInfo::get_type(), Analyzer::Expr::get_type_info(), Fragmenter_Namespace::FragmentInfo::getChunkMetadataMap(), Analyzer::ColumnVar::getColumnKey(), INVALID, kDOUBLE, kEQ, kFLOAT, kGE, kGT, kLE, kLT, NOT_SKIPPABLE, and SKIPPABLE.

Referenced by skipFragment().

4568  {
4569  auto col_id = lhs_col->getColumnKey().column_id;
4570  auto chunk_meta_it = fragment.getChunkMetadataMap().find(col_id);
4571  if (chunk_meta_it == fragment.getChunkMetadataMap().end()) {
4572  return FragmentSkipStatus::INVALID;
4573  }
4574  double chunk_min{0.};
4575  double chunk_max{0.};
4576  const auto& chunk_type = lhs_col->get_type_info();
4577  chunk_min = extract_min_stat_fp_type(chunk_meta_it->second->chunkStats, chunk_type);
4578  chunk_max = extract_max_stat_fp_type(chunk_meta_it->second->chunkStats, chunk_type);
4579  if (chunk_min > chunk_max) {
4580  return FragmentSkipStatus::INVALID;
4581  }
4582 
4583  const auto datum_fp = rhs_const->get_constval();
4584  const auto rhs_type = rhs_const->get_type_info().get_type();
4585  CHECK(rhs_type == kFLOAT || rhs_type == kDOUBLE);
4586 
4587  // Do we need to codegen the constant like the integer path does?
4588  const auto rhs_val = rhs_type == kFLOAT ? datum_fp.floatval : datum_fp.doubleval;
4589 
4590  // Todo: dedup the following comparison code with the integer/timestamp path; it is
4591  // slightly tricky to do cleanly as we do not have rowid on this path
4592  switch (comp_expr->get_optype()) {
4593  case kGE:
4594  if (chunk_max < rhs_val) {
4595  return FragmentSkipStatus::SKIPPABLE;
4596  }
4597  break;
4598  case kGT:
4599  if (chunk_max <= rhs_val) {
4600  return FragmentSkipStatus::SKIPPABLE;
4601  }
4602  break;
4603  case kLE:
4604  if (chunk_min > rhs_val) {
4605  return FragmentSkipStatus::SKIPPABLE;
4606  }
4607  break;
4608  case kLT:
4609  if (chunk_min >= rhs_val) {
4610  return FragmentSkipStatus::SKIPPABLE;
4611  }
4612  break;
4613  case kEQ:
4614  if (chunk_min > rhs_val || chunk_max < rhs_val) {
4615  return FragmentSkipStatus::SKIPPABLE;
4616  }
4617  break;
4618  default:
4619  break;
4620  }
4621  return FragmentSkipStatus::NOT_SKIPPABLE;
4622 }
double extract_max_stat_fp_type(const ChunkStats &stats, const SQLTypeInfo &ti)
Definition: sqldefs.h:34
Definition: sqldefs.h:35
Definition: sqldefs.h:29
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
SQLOps get_optype() const
Definition: Analyzer.h:452
double extract_min_stat_fp_type(const ChunkStats &stats, const SQLTypeInfo &ti)
const ChunkMetadataMap & getChunkMetadataMap() const
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:79
Definition: sqldefs.h:33
const shared::ColumnKey & getColumnKey() const
Definition: Analyzer.h:198
Datum get_constval() const
Definition: Analyzer.h:348
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqldefs.h:32

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
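A standalone restatement of the min/max pruning rule above (enum names are illustrative): a fragment is skippable when its chunk statistics prove that no row can satisfy the floating-point comparison. For example, with statistics [1.5, 3.25] and the predicate col < 1.0, chunk_min >= 1.0 holds and the fragment can be skipped.

enum class SkipStatus { SKIPPABLE, NOT_SKIPPABLE, INVALID };
enum class CompareOp { GE, GT, LE, LT, EQ };

SkipStatus can_skip_fragment_fp(CompareOp op,
                                double chunk_min,
                                double chunk_max,
                                double rhs_val) {
  if (chunk_min > chunk_max) {
    return SkipStatus::INVALID;  // unusable statistics
  }
  switch (op) {
    case CompareOp::GE:
      return chunk_max < rhs_val ? SkipStatus::SKIPPABLE : SkipStatus::NOT_SKIPPABLE;
    case CompareOp::GT:
      return chunk_max <= rhs_val ? SkipStatus::SKIPPABLE : SkipStatus::NOT_SKIPPABLE;
    case CompareOp::LE:
      return chunk_min > rhs_val ? SkipStatus::SKIPPABLE : SkipStatus::NOT_SKIPPABLE;
    case CompareOp::LT:
      return chunk_min >= rhs_val ? SkipStatus::SKIPPABLE : SkipStatus::NOT_SKIPPABLE;
    case CompareOp::EQ:
      return (chunk_min > rhs_val || chunk_max < rhs_val) ? SkipStatus::SKIPPABLE
                                                          : SkipStatus::NOT_SKIPPABLE;
  }
  return SkipStatus::NOT_SKIPPABLE;
}
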

llvm::Value * Executor::castToFP ( llvm::Value *  value,
SQLTypeInfo const &  from_ti,
SQLTypeInfo const &  to_ti 
)
private

Definition at line 4367 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, cgen_state_, exp_to_scale(), logger::FATAL, SQLTypeInfo::get_scale(), SQLTypeInfo::get_size(), SQLTypeInfo::is_fp(), SQLTypeInfo::is_number(), and LOG.

4369  {
4371  if (value->getType()->isIntegerTy() && from_ti.is_number() && to_ti.is_fp() &&
4372  (!from_ti.is_fp() || from_ti.get_size() != to_ti.get_size())) {
4373  llvm::Type* fp_type{nullptr};
4374  switch (to_ti.get_size()) {
4375  case 4:
4376  fp_type = llvm::Type::getFloatTy(cgen_state_->context_);
4377  break;
4378  case 8:
4379  fp_type = llvm::Type::getDoubleTy(cgen_state_->context_);
4380  break;
4381  default:
4382  LOG(FATAL) << "Unsupported FP size: " << to_ti.get_size();
4383  }
4384  value = cgen_state_->ir_builder_.CreateSIToFP(value, fp_type);
4385  if (from_ti.get_scale()) {
4386  value = cgen_state_->ir_builder_.CreateFDiv(
4387  value,
4388  llvm::ConstantFP::get(value->getType(), exp_to_scale(from_ti.get_scale())));
4389  }
4390  }
4391  return value;
4392 }
#define LOG(tag)
Definition: Logger.h:285
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
#define AUTOMATIC_IR_METADATA(CGENSTATE)
uint64_t exp_to_scale(const unsigned exp)

+ Here is the call graph for this function:
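The scale adjustment above exists because a DECIMAL(precision, scale) value is stored as a scaled integer: after the signed-int-to-FP conversion, the result is divided by 10^scale. A minimal sketch of the same arithmetic, mirroring what exp_to_scale() supplies:

#include <cstdint>

double decimal_to_fp(int64_t encoded, unsigned scale) {
  uint64_t divisor = 1;  // exp_to_scale(scale) == 10^scale
  for (unsigned i = 0; i < scale; ++i) {
    divisor *= 10;
  }
  return static_cast<double>(encoded) / static_cast<double>(divisor);
}

// Example: a DECIMAL(10, 2) column stores 12.34 as 1234; decimal_to_fp(1234, 2) == 12.34.
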

llvm::Value * Executor::castToIntPtrTyIn ( llvm::Value *  val,
const size_t  bit_width 
)
private

Definition at line 4394 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, cgen_state_, CHECK, CHECK_LT, and get_int_type().

4394  {
4396  CHECK(val->getType()->isPointerTy());
4397 
4398  const auto val_ptr_type = static_cast<llvm::PointerType*>(val->getType());
4399  const auto val_type = val_ptr_type->getPointerElementType();
4400  size_t val_width = 0;
4401  if (val_type->isIntegerTy()) {
4402  val_width = val_type->getIntegerBitWidth();
4403  } else {
4404  if (val_type->isFloatTy()) {
4405  val_width = 32;
4406  } else {
4407  CHECK(val_type->isDoubleTy());
4408  val_width = 64;
4409  }
4410  }
4411  CHECK_LT(size_t(0), val_width);
4412  if (bitWidth == val_width) {
4413  return val;
4414  }
4415  return cgen_state_->ir_builder_.CreateBitCast(
4416  val, llvm::PointerType::get(get_int_type(bitWidth, cgen_state_->context_), 0));
4417 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:303
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

bool Executor::checkCurrentQuerySession ( const std::string &  candidate_query_session,
heavyai::shared_lock< heavyai::shared_mutex > &  read_lock 
)

Definition at line 4957 of file Execute.cpp.

References current_query_session_.

4959  {
4960  // the candidate query session matches the current query session only when the
4961  // candidate is non-empty and equal to current_query_session_
4962  return !candidate_query_session.empty() &&
4963  (current_query_session_ == candidate_query_session);
4964 }
QuerySessionId current_query_session_
Definition: Execute.h:1576
bool Executor::checkIsQuerySessionEnrolled ( const QuerySessionId query_session,
heavyai::shared_lock< heavyai::shared_mutex > &  read_lock 
)

Definition at line 5231 of file Execute.cpp.

References queries_session_map_.

Referenced by executeWorkUnitImpl().

5233  {
5234  if (query_session.empty()) {
5235  return false;
5236  }
5237  return !query_session.empty() && queries_session_map_.count(query_session);
5238 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580

+ Here is the caller graph for this function:

bool Executor::checkIsQuerySessionInterrupted ( const std::string &  query_session,
heavyai::shared_lock< heavyai::shared_mutex > &  read_lock 
)

Definition at line 5220 of file Execute.cpp.

References queries_interrupt_flag_.

Referenced by executePlanWithGroupBy(), executePlanWithoutGroupBy(), fetchChunks(), and fetchUnionChunks().

5222  {
5223  if (query_session.empty()) {
5224  return false;
5225  }
5226  auto flag_it = queries_interrupt_flag_.find(query_session);
5227  return !query_session.empty() && flag_it != queries_interrupt_flag_.end() &&
5228  flag_it->second;
5229 }
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1578

+ Here is the caller graph for this function:

bool Executor::checkNonKernelTimeInterrupted ( ) const

Definition at line 5329 of file Execute.cpp.

References current_query_session_, executor_id_, executor_session_mutex_, queries_interrupt_flag_, and UNITARY_EXECUTOR_ID.

5329  {
5330  // this function should be called within an executor which is assigned
5331  // to the specific query thread (that indicates we already enrolled the session)
5332  // check whether this is called from a non-unitary executor
5333  if (executor_id_ == UNITARY_EXECUTOR_ID) {
5334  return false;
5335  };
5336  heavyai::shared_lock<heavyai::shared_mutex> session_read_lock(executor_session_mutex_);
5337  auto flag_it = queries_interrupt_flag_.find(current_query_session_);
5338  return !current_query_session_.empty() && flag_it != queries_interrupt_flag_.end() &&
5339  flag_it->second;
5340 }
QuerySessionId current_query_session_
Definition: Execute.h:1576
std::shared_lock< T > shared_lock
const ExecutorId executor_id_
Definition: Execute.h:1476
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1578
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
static constexpr ExecutorId UNITARY_EXECUTOR_ID
Definition: Execute.h:423
void Executor::checkPendingQueryStatus ( const QuerySessionId query_session)

Definition at line 5001 of file Execute.cpp.

References ERR_INTERRUPTED, executor_session_mutex_, queries_interrupt_flag_, queries_session_map_, and VLOG.

5001  {
5002  // check whether we are okay to execute the "pending" query
5003  // i.e., before running the query check if this query session is "ALREADY" interrupted
5005  if (query_session.empty()) {
5006  return;
5007  }
5008  if (queries_interrupt_flag_.find(query_session) == queries_interrupt_flag_.end()) {
5009  // something goes wrong since we assume this is caller's responsibility
5010  // (call this function only for enrolled query session)
5011  if (!queries_session_map_.count(query_session)) {
5012  VLOG(1) << "Interrupting pending query is not available since the query session is "
5013  "not enrolled";
5014  } else {
5015  // here the query session is enrolled but the interrupt flag is not registered
5016  VLOG(1)
5017  << "Interrupting pending query is not available since its interrupt flag is "
5018  "not registered";
5019  }
5020  return;
5021  }
5022  if (queries_interrupt_flag_[query_session]) {
5023  throw QueryExecutionError(ERR_INTERRUPTED);
5024  }
5025 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
std::shared_lock< T > shared_lock
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1578
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
#define VLOG(n)
Definition: Logger.h:388
void Executor::clearCaches ( bool  runtime_only = false)
void Executor::clearCardinalityCache ( )
static

Definition at line 5275 of file Execute.cpp.

References cardinality_cache_, g_use_estimator_result_cache, and recycler_mutex_.

Referenced by clearExternalCaches().

5275  {
5278  cardinality_cache_.clear();
5279  }
5280 }
std::unique_lock< T > unique_lock
static std::unordered_map< CardinalityCacheKey, size_t > cardinality_cache_
Definition: Execute.h:1607
static heavyai::shared_mutex recycler_mutex_
Definition: Execute.h:1605
bool g_use_estimator_result_cache
Definition: Execute.cpp:135

+ Here is the caller graph for this function:

static void Executor::clearExternalCaches ( bool  for_update,
const TableDescriptor td,
const int  current_db_id 
)
inlinestatic

Definition at line 438 of file Execute.h.

References clearCardinalityCache(), TableDescriptor::getTableChunkKey(), hash_value(), CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCaches(), CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCachesByTable(), invalidateCardinalityCacheForTable(), and TableDescriptor::tableId.

Referenced by AlterTableAlterColumnCommand::clearInMemoryData(), clearMemory(), DropForeignTableCommand::execute(), Parser::InsertIntoTableAsSelectStmt::execute(), Parser::DropTableStmt::execute(), Parser::TruncateTableStmt::execute(), Parser::OptimizeTableStmt::execute(), Parser::AddColumnStmt::execute(), Parser::DropColumnStmt::execute(), Parser::AlterTableParamStmt::execute(), Parser::CopyTableStmt::execute(), RelAlgExecutor::executeDelete(), RelAlgExecutor::executeSimpleInsert(), RelAlgExecutor::executeUpdate(), Catalog_Namespace::Catalog::invalidateCachesForTable(), foreign_storage::refresh_foreign_table_unlocked(), DBHandler::set_table_epochs(), Catalog_Namespace::Catalog::setUncappedTableEpoch(), and DBHandler::shutdown().

440  {
441  bool clearEntireCache = true;
442  if (td) {
443  const auto& table_chunk_key_prefix = td->getTableChunkKey(current_db_id);
444  if (!table_chunk_key_prefix.empty()) {
445  auto table_key = boost::hash_value(table_chunk_key_prefix);
447  if (for_update) {
449  } else {
451  }
453  clearEntireCache = false;
454  }
455  }
456  if (clearEntireCache) {
458  if (for_update) {
460  } else {
462  }
464  }
465  }
static void invalidateCachesByTable(size_t table_key)
static void invalidateCaches()
static void clearCardinalityCache()
Definition: Execute.cpp:5275
static void invalidateCardinalityCacheForTable(const shared::TableKey &table_key)
Definition: Execute.cpp:5282
std::size_t hash_value(RexAbstractInput const &rex_ab_input)
Definition: RelAlgDag.cpp:3525
std::vector< int > getTableChunkKey(const int getCurrentDBId) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Executor::clearMemory ( const Data_Namespace::MemoryLevel  memory_level)
static

Definition at line 531 of file Execute.cpp.

References clearExternalCaches(), Data_Namespace::DataMgr::clearMemory(), Data_Namespace::CPU_LEVEL, execute_mutex_, Catalog_Namespace::SysCatalog::getDataMgr(), Data_Namespace::GPU_LEVEL, Catalog_Namespace::SysCatalog::instance(), and CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCaches().

Referenced by DBHandler::clear_cpu_memory(), DBHandler::clear_gpu_memory(), QueryRunner::QueryRunner::clearCpuMemory(), and QueryRunner::QueryRunner::clearGpuMemory().

531  {
532  switch (memory_level) {
533  case Data_Namespace::MemoryLevel::CPU_LEVEL:
534  case Data_Namespace::MemoryLevel::GPU_LEVEL: {
535  heavyai::unique_lock<heavyai::shared_mutex> flush_lock(
536  execute_mutex_); // Don't flush memory while queries are running
537 
538  if (memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
539  // The hash table cache uses CPU memory not managed by the buffer manager. In the
540  // future, we should manage these allocations with the buffer manager directly.
541  // For now, assume the user wants to purge the hash table cache when they clear
542  // CPU memory (currently used in ExecuteTest to lower memory pressure)
543  // TODO: Move JoinHashTableCacheInvalidator to Executor::clearExternalCaches();
544  JoinHashTableCacheInvalidator::invalidateCaches();
545  }
546  Executor::clearExternalCaches(true, nullptr, 0);
547  Catalog_Namespace::SysCatalog::instance().getDataMgr().clearMemory(memory_level);
548  break;
549  }
550  default: {
551  throw std::runtime_error(
552  "Clearing memory levels other than the CPU level or GPU level is not "
553  "supported.");
554  }
555  }
556 }
static heavyai::shared_mutex execute_mutex_
Definition: Execute.h:1585
void clearMemory(const MemoryLevel memLevel)
Definition: DataMgr.cpp:465
static void invalidateCaches()
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:234
static SysCatalog & instance()
Definition: SysCatalog.h:343
std::unique_lock< T > unique_lock
static void clearExternalCaches(bool for_update, const TableDescriptor *td, const int current_db_id)
Definition: Execute.h:438

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Executor::clearMetaInfoCache ( )
private

Definition at line 1029 of file Execute.cpp.

References agg_col_range_cache_, TableGenerations::clear(), AggregatedColRange::clear(), InputTableInfoCache::clear(), input_table_info_cache_, and table_generations_.

1029  {
1030  input_table_info_cache_.clear();
1031  agg_col_range_cache_.clear();
1032  table_generations_.clear();
1033 }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1572
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:1571
TableGenerations table_generations_
Definition: Execute.h:1573

+ Here is the call graph for this function:

void Executor::clearQuerySessionStatus ( const QuerySessionId query_session,
const std::string &  submitted_time_str 
)

Definition at line 5027 of file Execute.cpp.

References current_query_session_, executor_session_mutex_, invalidateRunningQuerySession(), removeFromQuerySessionList(), and resetInterrupt().

5028  {
5029  heavyai::unique_lock<heavyai::shared_mutex> session_write_lock(executor_session_mutex_);
5030  // clear the interrupt-related info for a finished query
5031  if (query_session.empty()) {
5032  return;
5033  }
5034  removeFromQuerySessionList(query_session, submitted_time_str, session_write_lock);
5035  if (query_session.compare(current_query_session_) == 0) {
5036  invalidateRunningQuerySession(session_write_lock);
5037  resetInterrupt();
5038  }
5039 }
QuerySessionId current_query_session_
Definition: Execute.h:1576
bool removeFromQuerySessionList(const QuerySessionId &query_session, const std::string &submitted_time_str, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
Definition: Execute.cpp:5175
std::unique_lock< T > unique_lock
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
void resetInterrupt()
void invalidateRunningQuerySession(heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
Definition: Execute.cpp:4979

+ Here is the call graph for this function:

llvm::Value * Executor::codegenAggregateWindowState ( CodeGenerator code_generator,
const CompilationOptions co,
llvm::Value *  aggregate_state 
)
private

Definition at line 1500 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, COUNT, CodegenUtil::createPtrWithHoistedMemoryAddr(), anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), Analyzer::WindowFunction::getKind(), kDECIMAL, kDOUBLE, kFLOAT, and WindowFunctionContext::NUM_EXECUTION_DEVICES.

1502  {
1504  const auto pi32_type =
1505  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
1506  const auto pi64_type =
1507  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
1508  const auto window_func_context =
1509  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
1510  const Analyzer::WindowFunction* window_func = window_func_context->getWindowFunction();
1511  const auto window_func_ti = get_adjusted_window_type_info(window_func);
1512  const auto aggregate_state_type =
1513  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
1514  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
1515  const auto aggregate_state_count_i64 = cgen_state_->llInt(
1516  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
1517  auto aggregate_state_count = CodegenUtil::createPtrWithHoistedMemoryAddr(
1518  cgen_state_.get(),
1519  code_generator,
1520  co,
1521  aggregate_state_count_i64,
1522  aggregate_state_type,
1523  WindowFunctionContext::NUM_EXECUTION_DEVICES)
1524  .front();
1525  const auto double_null_lv = cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE));
1526  switch (window_func_ti.get_type()) {
1527  case kFLOAT: {
1528  return cgen_state_->emitCall(
1529  "load_avg_float", {aggregate_state, aggregate_state_count, double_null_lv});
1530  }
1531  case kDOUBLE: {
1532  return cgen_state_->emitCall(
1533  "load_avg_double", {aggregate_state, aggregate_state_count, double_null_lv});
1534  }
1535  case kDECIMAL: {
1536  return cgen_state_->emitCall(
1537  "load_avg_decimal",
1538  {aggregate_state,
1539  aggregate_state_count,
1540  double_null_lv,
1541  cgen_state_->llInt<int32_t>(window_func_ti.get_scale())});
1542  }
1543  default: {
1544  return cgen_state_->emitCall(
1545  "load_avg_int", {aggregate_state, aggregate_state_count, double_null_lv});
1546  }
1547  }
1548  }
1549  if (window_func->getKind() == SqlWindowFunctionKind::COUNT) {
1550  return cgen_state_->ir_builder_.CreateLoad(
1551  aggregate_state->getType()->getPointerElementType(), aggregate_state);
1552  }
1553  switch (window_func_ti.get_type()) {
1554  case kFLOAT: {
1555  return cgen_state_->emitCall("load_float", {aggregate_state});
1556  }
1557  case kDOUBLE: {
1558  return cgen_state_->emitCall("load_double", {aggregate_state});
1559  }
1560  default: {
1561  return cgen_state_->ir_builder_.CreateLoad(
1562  aggregate_state->getType()->getPointerElementType(), aggregate_state);
1563  }
1564  }
1565 }
SqlWindowFunctionKind getKind() const
Definition: Analyzer.h:2794
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static const int NUM_EXECUTION_DEVICES
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)

+ Here is the call graph for this function:
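The load_avg_* runtime helpers called above finalize an AVG window aggregate from its running sum and count. A sketch of that finalization, under the assumption that a zero count yields the type's null sentinel; the name and flat-value signature are illustrative, not the actual runtime interface, which works on pointers into the aggregate state.

#include <cstdint>

double finalize_avg(double running_sum, int64_t running_count, double null_sentinel) {
  // A partition with no accepted rows produces NULL rather than dividing by zero.
  return running_count ? running_sum / static_cast<double>(running_count)
                       : null_sentinel;
}
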

llvm::Value * Executor::codegenConditionalAggregateCondValSelector ( llvm::Value *  cond_lv,
SQLAgg const  aggKind,
CompilationOptions const &  co 
) const
private

Definition at line 1567 of file WindowFunctionIR.cpp.

References CHECK, and kSUM_IF.

1570  {
1571  llvm::Value* res_cond_lv{nullptr};
1572  switch (aggKind) {
1573  case kSUM_IF:
1574  if (cond_lv->getType()->isIntegerTy(1)) {
1575  // cond_expr returns i1 type val, just need to cast to i8 type
1576  // i.e., cond_expr IS NULL
1577  res_cond_lv = cgen_state_->castToTypeIn(cond_lv, 8);
1578  } else {
1579  CHECK(cond_lv->getType()->isIntegerTy(8));
1580  // cond_expr may have null value instead of upcasted bool (i1-type) value
1581  // so we have to correctly set true condition
1582  // i.e., i8 @gt_int32_t_nullable_lhs(..., i64 -2147483648, i8 -128)
1583  // has one of the following i8-type values: 1, 0, -128
1584  auto true_cond_lv =
1585  cgen_state_->ir_builder_.CreateICmpEQ(cond_lv, cgen_state_->llInt((int8_t)1));
1586  res_cond_lv = cgen_state_->ir_builder_.CreateSelect(
1587  true_cond_lv, cgen_state_->llInt((int8_t)1), cgen_state_->llInt((int8_t)0));
1588  }
1589  break;
1590  default:
1591  break;
1592  }
1593  return res_cond_lv;
1594 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
#define CHECK(condition)
Definition: Logger.h:291
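The SUM_IF normalization above can be restated in plain C++: a nullable boolean condition arrives as an int8 that is 1 (true), 0 (false), or the null sentinel -128, and only an exact 1 counts as a matching row.

#include <cstdint>

int8_t normalize_sum_if_cond(int8_t nullable_bool) {
  // null (-128) and false (0) both contribute nothing to SUM_IF
  return nullable_bool == 1 ? 1 : 0;
}
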
llvm::Value * Executor::codegenCurrentPartitionIndex ( const WindowFunctionContext window_func_context,
CodeGenerator code_generator,
const CompilationOptions co,
llvm::Value *  current_row_pos_lv 
)
private

Definition at line 771 of file WindowFunctionIR.cpp.

References CodegenUtil::createPtrWithHoistedMemoryAddr(), WindowFunctionContext::elementCount(), get_int_type(), WindowFunctionContext::getWindowFunction(), Analyzer::WindowFunction::isFrameNavigateWindowFunction(), WindowFunctionContext::NUM_EXECUTION_DEVICES, WindowFunctionContext::partitionCount(), WindowFunctionContext::partitionNumCountBuf(), and WindowFunctionContext::payload().

775  {
776  const auto pi64_type =
777  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
778  const auto pi32_type =
779  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
780  auto row_pos_lv = current_row_pos_lv;
781  if (window_func_context->getWindowFunction()->isFrameNavigateWindowFunction()) {
782  // `current_row_pos_lv` indicates the index of the current row, but to figure out
783  // the index of the window partition it belongs to, we need a special approach,
784  // especially for window frame navigation functions. For instance, when we have
785  // five rows having two columns pc and val such as (2,1), (2,2), (2,3), (1,1),
786  // (1,2), we build a OneToMany Perfect Hash Table as: offset: 0 2 / count: 2 3 /
787  // payload: i1, i2, i3, i4, i5 where i1 ~ i3 and i4 ~ i5 are rows for partition 1
788  // (i.e., pc = 1) and 2 (i.e., pc = 2), respectively. But when processing the first
789  // row (2, 1), the original `current_row_pos_lv` stands for zero so computing which
790  // partitions it belongs to is hard unless hashing the value at runtime. Even if we
791  // do hash, we cannot know the exact hash slot unless we do binary + linear searches
792  // multiple times (via payload buffer and the ordered payload buffer) i.e., when the
793  // row (1,2) is assigned to the partition[4], we cannot find the hash slot index '4'
794  // by using `current_row_pos_lv` unless doing a costly operation like a linear
795  // search over the entire window partition Instead, we collect a hash slot that each
796  // row is assigned to and keep this info at the payload buffer
797  // `hash_slot_idx_ptr_lv` and use it for computing window frame navigation functions
798  auto* const hash_slot_idx_ptr =
799  window_func_context->payload() + window_func_context->elementCount();
800  auto hash_slot_idx_buf_lv =
801  cgen_state_->llInt(reinterpret_cast<int64_t>(hash_slot_idx_ptr));
802  auto hash_slot_idx_ptr_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
803  cgen_state_.get(),
804  code_generator,
805  co,
806  hash_slot_idx_buf_lv,
807  pi32_type,
808  WindowFunctionContext::NUM_EXECUTION_DEVICES)
809  .front();
810  auto hash_slot_idx_load_lv = cgen_state_->ir_builder_.CreateGEP(
811  hash_slot_idx_ptr_lv->getType()->getPointerElementType(),
812  hash_slot_idx_ptr_lv,
813  current_row_pos_lv);
814  row_pos_lv = cgen_state_->castToTypeIn(
815  cgen_state_->ir_builder_.CreateLoad(
816  hash_slot_idx_load_lv->getType()->getPointerElementType(),
817  hash_slot_idx_load_lv,
818  "cur_row_hash_slot_idx"),
819  64);
820  }
821  auto partition_count_lv = cgen_state_->llInt(window_func_context->partitionCount());
822  auto partition_num_count_buf_lv = cgen_state_->llInt(
823  reinterpret_cast<int64_t>(window_func_context->partitionNumCountBuf()));
824  auto partition_num_count_ptr_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
825  cgen_state_.get(),
826  code_generator,
827  co,
828  partition_num_count_buf_lv,
829  pi64_type,
830  WindowFunctionContext::NUM_EXECUTION_DEVICES)
831  .front();
832  return cgen_state_->emitCall(
833  "compute_int64_t_lower_bound",
834  {partition_count_lv, row_pos_lv, partition_num_count_ptr_lv});
835 }
bool isFrameNavigateWindowFunction() const
Definition: Analyzer.h:2848
size_t elementCount() const
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
size_t partitionCount() const
static const int NUM_EXECUTION_DEVICES
const int64_t * partitionNumCountBuf() const
const Analyzer::WindowFunction * getWindowFunction() const
const int32_t * payload() const
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)

+ Here is the call graph for this function:
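compute_int64_t_lower_bound, called at the end of the listing, is by its name a lower-bound binary search over a sorted int64 buffer; a minimal standalone version under that assumption:

#include <cstdint>

int64_t lower_bound_i64(const int64_t* buf, int64_t entry_count, int64_t target) {
  int64_t lo = 0;
  int64_t hi = entry_count;
  while (lo < hi) {
    const int64_t mid = lo + (hi - lo) / 2;
    if (buf[mid] < target) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }
  return lo;  // first index whose value is >= target
}
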

llvm::Value * Executor::codegenFrameBound ( bool  for_start_bound,
bool  for_range_mode,
bool  for_window_frame_naviation,
const Analyzer::WindowFrame frame_bound,
bool  is_timestamp_type_frame,
llvm::Value *  order_key_null_val,
const WindowFrameBoundFuncArgs args 
)
private

Definition at line 639 of file WindowFunctionIR.cpp.

References CHECK, CURRENT_ROW, WindowFrameBoundFuncArgs::current_row_pos_lv, EXPR_FOLLOWING, EXPR_PRECEDING, WindowFrameBoundFuncArgs::frame_end_bound_expr_lv, WindowFrameBoundFuncArgs::frame_start_bound_expr_lv, Analyzer::WindowFrame::getBoundType(), WindowFrameBoundFuncArgs::int64_t_one_val_lv, WindowFrameBoundFuncArgs::int64_t_zero_val_lv, WindowFrameBoundFuncArgs::num_elem_current_partition_lv, WindowFrameBoundFuncArgs::order_type_col_name, UNBOUNDED_FOLLOWING, and UNBOUNDED_PRECEDING.

645  {
646  const auto bound_type = frame_bound->getBoundType();
647  auto adjust_frame_end_bound = [&](llvm::Value* target_bound_lv) {
648  return cgen_state_->ir_builder_.CreateSub(target_bound_lv, args.int64_t_one_val_lv);
649  };
650  if (bound_type == SqlWindowFrameBoundType::UNBOUNDED_PRECEDING) {
651  CHECK(for_start_bound) << "frame end cannot be UNBOUNDED PRECEDING";
652  return args.int64_t_zero_val_lv;
653  } else if (bound_type == SqlWindowFrameBoundType::UNBOUNDED_FOLLOWING) {
654  CHECK(!for_start_bound) << "frame start cannot be UNBOUNDED FOLLOWING";
655  // adjust frame bound w.r.t the open frame interval if necessary
656  return for_window_frame_naviation
657  ? adjust_frame_end_bound(args.num_elem_current_partition_lv)
658  : args.num_elem_current_partition_lv;
659  }
660  std::vector<llvm::Value*> func_args;
661  std::string op_name =
662  bound_type == SqlWindowFrameBoundType::EXPR_FOLLOWING ? "add" : "sub";
663  if (!for_range_mode) {
664  llvm::Value* current_row_bound_expr_lv{nullptr};
665  if (for_window_frame_naviation) {
666  // we already know a current row's index in (ordered) window frame in this case
667  auto bound_expr =
668  for_start_bound ? args.frame_start_bound_expr_lv : args.frame_end_bound_expr_lv;
669  if (bound_type == SqlWindowFrameBoundType::EXPR_FOLLOWING) {
670  current_row_bound_expr_lv =
671  cgen_state_->ir_builder_.CreateAdd(args.current_row_pos_lv, bound_expr);
672  } else if (bound_type == SqlWindowFrameBoundType::EXPR_PRECEDING) {
673  current_row_bound_expr_lv =
674  cgen_state_->ir_builder_.CreateSub(args.current_row_pos_lv, bound_expr);
675  } else {
676  CHECK(bound_type == SqlWindowFrameBoundType::CURRENT_ROW);
677  current_row_bound_expr_lv = args.current_row_pos_lv;
678  }
679  // adjust frame bound w.r.t the open frame interval
680  if (for_start_bound) {
681  return cgen_state_->ir_builder_.CreateSelect(
682  cgen_state_->ir_builder_.CreateICmpSLT(current_row_bound_expr_lv,
683  args.int64_t_zero_val_lv),
684  args.int64_t_zero_val_lv,
685  current_row_bound_expr_lv);
686  } else {
687  return cgen_state_->ir_builder_.CreateSelect(
688  cgen_state_->ir_builder_.CreateICmpSGE(current_row_bound_expr_lv,
689  args.num_elem_current_partition_lv),
690  adjust_frame_end_bound(args.num_elem_current_partition_lv),
691  current_row_bound_expr_lv);
692  }
693  } else {
694  std::string func_class = for_start_bound ? "start" : "end";
695  auto const func_name = "compute_row_mode_" + func_class + "_index_" + op_name;
696  func_args = prepareRowModeFuncArgs(for_start_bound, bound_type, args);
697  current_row_bound_expr_lv = cgen_state_->emitCall(func_name, func_args);
698  }
699  return current_row_bound_expr_lv;
700  } else {
701  std::string func_class = for_start_bound ? "lower" : "upper";
702  auto const func_name = getFramingFuncName(
703  func_class,
704  args.order_type_col_name,
705  op_name,
706  bound_type != SqlWindowFrameBoundType::CURRENT_ROW && is_timestamp_type_frame);
707  func_args = prepareRangeModeFuncArgs(
708  for_start_bound, frame_bound, is_timestamp_type_frame, order_key_null_val, args);
709  auto frame_bound_lv = cgen_state_->emitCall(func_name, func_args);
710  if (!for_start_bound && for_window_frame_naviation) {
711  // adjust frame end bound w.r.t the open frame interval
712  frame_bound_lv = cgen_state_->ir_builder_.CreateSelect(
713  cgen_state_->ir_builder_.CreateICmpSGE(frame_bound_lv,
714  args.num_elem_current_partition_lv),
715  adjust_frame_end_bound(args.num_elem_current_partition_lv),
716  frame_bound_lv);
717  }
718  return frame_bound_lv;
719  }
720 }
llvm::Value * num_elem_current_partition_lv
Definition: WindowContext.h:95
llvm::Value * current_row_pos_lv
Definition: WindowContext.h:90
llvm::Value * frame_end_bound_expr_lv
Definition: WindowContext.h:89
std::string getFramingFuncName(const std::string &bound_type, const std::string &order_col_type, const std::string &op_type, bool for_timestamp_type) const
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
SqlWindowFrameBoundType getBoundType() const
Definition: Analyzer.h:2695
std::vector< llvm::Value * > prepareRangeModeFuncArgs(bool for_start_bound, const Analyzer::WindowFrame *frame_bound, bool is_timestamp_type_frame, llvm::Value *order_key_null_val, const WindowFrameBoundFuncArgs &frame_args) const
#define CHECK(condition)
Definition: Logger.h:291
llvm::Value * int64_t_zero_val_lv
Definition: WindowContext.h:93
llvm::Value * int64_t_one_val_lv
Definition: WindowContext.h:94
llvm::Value * frame_start_bound_expr_lv
Definition: WindowContext.h:88
std::string order_type_col_name
Definition: WindowContext.h:97
std::vector< llvm::Value * > prepareRowModeFuncArgs(bool for_start_bound, SqlWindowFrameBoundType bound_type, const WindowFrameBoundFuncArgs &args) const
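
For ROWS-mode frames the generated arithmetic is simple: the bound is the current row position plus or minus the bound offset, clamped to the partition, and the frame-navigation path additionally pulls the end bound back by one (adjust_frame_end_bound) to model an open interval. A scalar sketch of the clamping only (illustrative; the real code emits LLVM instructions per row):

#include <algorithm>
#include <cstdint>

// Illustrative scalar version of the ROWS-mode bound computation above.
int64_t row_mode_bound(int64_t current_row_pos,
                       int64_t bound_offset,    // N of "N PRECEDING" / "N FOLLOWING"
                       bool following,          // add for FOLLOWING, sub for PRECEDING
                       bool is_start_bound,
                       int64_t partition_size) {
  const int64_t bound = following ? current_row_pos + bound_offset
                                  : current_row_pos - bound_offset;
  if (is_start_bound) {
    return std::max<int64_t>(bound, 0);                 // clamp to the partition start
  }
  return std::min<int64_t>(bound, partition_size - 1);  // clamp to the partition end
}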

llvm::Value * Executor::codegenFrameBoundExpr ( const Analyzer::WindowFunction window_func,
const Analyzer::WindowFrame frame_bound,
CodeGenerator code_generator,
const CompilationOptions co 
)
private

Definition at line 588 of file WindowFunctionIR.cpp.

References CHECK, CodeGenerator::codegen(), EXPR_FOLLOWING, EXPR_PRECEDING, g_cluster, SQLTypeInfo::get_size(), Analyzer::Expr::get_type_info(), Analyzer::WindowFrame::getBoundExpr(), Analyzer::WindowFunction::getOrderKeys(), Analyzer::WindowFunction::hasRangeModeFraming(), kBIGINT, kINT, and kSMALLINT.

591  {
592  auto needs_bound_expr_codegen = [](const Analyzer::WindowFrame* window_frame) {
593  return window_frame->getBoundType() == SqlWindowFrameBoundType::EXPR_FOLLOWING ||
594  window_frame->getBoundType() == SqlWindowFrameBoundType::EXPR_PRECEDING;
595  };
596  const auto order_col_ti = window_func->getOrderKeys().front()->get_type_info();
597  auto encode_date_col_val = [&order_col_ti, this](llvm::Value* bound_expr_lv) {
598  if (order_col_ti.get_comp_param() == 16) {
599  return cgen_state_->emitCall(
600  "fixed_width_date_encode_noinline",
601  {bound_expr_lv,
602  cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(SQLTypeInfo(kSMALLINT)),
603  32),
604  cgen_state_->inlineIntNull(SQLTypeInfo(kBIGINT))});
605  } else {
606  return cgen_state_->emitCall("fixed_width_date_encode_noinline",
607  {bound_expr_lv,
608  cgen_state_->inlineIntNull(SQLTypeInfo(kINT)),
609  cgen_state_->inlineIntNull(SQLTypeInfo(kBIGINT))});
610  }
611  };
612  llvm::Value* bound_expr_lv{nullptr};
613  if (needs_bound_expr_codegen(frame_bound)) {
614  auto bound_expr = frame_bound->getBoundExpr();
615  if (auto dateadd_expr = dynamic_cast<const Analyzer::DateaddExpr*>(bound_expr)) {
616  if (dateadd_expr->get_datetime_expr()->get_type_info().is_encoded_timestamp()) {
617  dateadd_expr->set_fixed_encoding_null_val();
618  }
619  }
620  auto bound_expr_lvs = code_generator.codegen(bound_expr, true, co);
621  bound_expr_lv = bound_expr_lvs.front();
622  if (order_col_ti.is_date() && window_func->hasRangeModeFraming()) {
623  if (g_cluster) {
624  throw std::runtime_error(
625  "Range mode with date type ordering column is not supported yet.");
626  }
627  bound_expr_lv = encode_date_col_val(bound_expr_lv);
628  }
629  if (frame_bound->getBoundExpr()->get_type_info().get_size() != 8) {
630  bound_expr_lv = cgen_state_->castToTypeIn(bound_expr_lv, 64);
631  }
632  } else {
633  bound_expr_lv = cgen_state_->llInt((int64_t)-1);
634  }
635  CHECK(bound_expr_lv);
636  return bound_expr_lv;
637 }
bool hasRangeModeFraming() const
Definition: Analyzer.h:2828
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
const std::vector< std::shared_ptr< Analyzer::Expr > > & getOrderKeys() const
Definition: Analyzer.h:2802
const Analyzer::Expr * getBoundExpr() const
Definition: Analyzer.h:2697
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:79
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:30
#define CHECK(condition)
Definition: Logger.h:291
bool g_cluster
Definition: sqltypes.h:72
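
Only EXPR_PRECEDING and EXPR_FOLLOWING bounds carry an expression that needs code generation; every other bound kind is represented by a constant -1, and any evaluated bound narrower than eight bytes is widened to 64 bits. A small sketch of that decision (the std::optional wrapper is illustrative, not part of the codegen API):

#include <cstdint>
#include <optional>

// Conceptual mirror of the bound-expression handling above: expression-based
// bounds yield a 64-bit value, all other bound kinds use a -1 placeholder.
int64_t frame_bound_literal(std::optional<int32_t> evaluated_bound_expr) {
  if (evaluated_bound_expr) {
    return static_cast<int64_t>(*evaluated_bound_expr);  // widened to 64 bits
  }
  return -1;  // CURRENT ROW / UNBOUNDED bounds have no expression to evaluate
}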

std::pair< llvm::Value *, llvm::Value * > Executor::codegenFrameBoundRange ( const Analyzer::WindowFunction window_func,
CodeGenerator code_generator,
const CompilationOptions co 
)
private

Definition at line 1055 of file WindowFunctionIR.cpp.

References CHECK, Analyzer::WindowFunction::getFrameEndBound(), and Analyzer::WindowFunction::getFrameStartBound().

1058  {
1059  const auto frame_start_bound = window_func->getFrameStartBound();
1060  const auto frame_end_bound = window_func->getFrameEndBound();
1061  auto frame_start_bound_expr_lv =
1062  codegenFrameBoundExpr(window_func, frame_start_bound, code_generator, co);
1063  auto frame_end_bound_expr_lv =
1064  codegenFrameBoundExpr(window_func, frame_end_bound, code_generator, co);
1065  CHECK(frame_start_bound_expr_lv);
1066  CHECK(frame_end_bound_expr_lv);
1067  return std::make_pair(frame_start_bound_expr_lv, frame_end_bound_expr_lv);
1068 }
const Analyzer::WindowFrame * getFrameStartBound() const
Definition: Analyzer.h:2806
const Analyzer::WindowFrame * getFrameEndBound() const
Definition: Analyzer.h:2813
llvm::Value * codegenFrameBoundExpr(const Analyzer::WindowFunction *window_func, const Analyzer::WindowFrame *frame_bound, CodeGenerator &code_generator, const CompilationOptions &co)
#define CHECK(condition)
Definition: Logger.h:291

std::pair< llvm::Value *, llvm::Value * > Executor::codegenFrameNullRange ( WindowFunctionContext window_func_context,
CodeGenerator code_generator,
const CompilationOptions co,
llvm::Value *  partition_index_lv 
) const
private

Definition at line 894 of file WindowFunctionIR.cpp.

References CodegenUtil::createPtrWithHoistedMemoryAddr(), get_int_type(), WindowFunctionContext::getNullValueEndPos(), WindowFunctionContext::getNullValueStartPos(), and WindowFunctionContext::NUM_EXECUTION_DEVICES.

898  {
899  const auto pi64_type =
900  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
901  const auto null_start_pos_buf = cgen_state_->llInt(
902  reinterpret_cast<int64_t>(window_func_context->getNullValueStartPos()));
903  const auto null_start_pos_buf_ptr = CodegenUtil::createPtrWithHoistedMemoryAddr(
904  cgen_state_.get(),
905  code_generator,
906  co,
907  null_start_pos_buf,
908  pi64_type,
909  WindowFunctionContext::NUM_EXECUTION_DEVICES)
910  .front();
911  const auto null_start_pos_ptr =
912  cgen_state_->ir_builder_.CreateGEP(get_int_type(64, cgen_state_->context_),
913  null_start_pos_buf_ptr,
914  partition_index_lv);
915  auto null_start_pos_lv = cgen_state_->ir_builder_.CreateLoad(
916  null_start_pos_ptr->getType()->getPointerElementType(),
917  null_start_pos_ptr,
918  "null_start_pos");
919  const auto null_end_pos_buf = cgen_state_->llInt(
920  reinterpret_cast<int64_t>(window_func_context->getNullValueEndPos()));
921  const auto null_end_pos_buf_ptr = CodegenUtil::createPtrWithHoistedMemoryAddr(
922  cgen_state_.get(),
923  code_generator,
924  co,
925  null_end_pos_buf,
926  pi64_type,
927  WindowFunctionContext::NUM_EXECUTION_DEVICES)
928  .front();
929  const auto null_end_pos_ptr = cgen_state_->ir_builder_.CreateGEP(
930  get_int_type(64, cgen_state_->context_), null_end_pos_buf_ptr, partition_index_lv);
931  auto null_end_pos_lv = cgen_state_->ir_builder_.CreateLoad(
932  null_end_pos_ptr->getType()->getPointerElementType(),
933  null_end_pos_ptr,
934  "null_end_pos");
935  return std::make_pair(null_start_pos_lv, null_end_pos_lv);
936 }
int64_t * getNullValueEndPos() const
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
int64_t * getNullValueStartPos() const
static const int NUM_EXECUTION_DEVICES
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
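
The null range marks where rows whose ordering key is NULL sit inside the sorted partition, so RANGE-mode framing can treat them as one contiguous block. The generated code only loads two precomputed positions for the current partition; a scalar sketch of that load (the struct and the open/closed convention of the end position are illustrative):

#include <cstdint>

// Illustrative reader for the precomputed per-partition null ranges loaded above.
struct NullRange {
  int64_t start;  // first position of NULL-keyed rows in the sorted partition
  int64_t end;    // end of the NULL block (convention defined by the window context)
};

NullRange load_null_range(const int64_t* null_start_pos_buf,
                          const int64_t* null_end_pos_buf,
                          int64_t partition_index) {
  return {null_start_pos_buf[partition_index], null_end_pos_buf[partition_index]};
}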

void Executor::codegenJoinLoops ( const std::vector< JoinLoop > &  join_loops,
const RelAlgExecutionUnit ra_exe_unit,
GroupByAndAggregate group_by_and_aggregate,
llvm::Function *  query_func,
llvm::BasicBlock *  entry_bb,
QueryMemoryDescriptor query_mem_desc,
const CompilationOptions co,
const ExecutionOptions eo 
)
private

Definition at line 1203 of file IRCodegen.cpp.

References ExecutionOptions::allow_runtime_query_interrupt, anonymous_namespace{QueryMemoryDescriptor.cpp}::any_of(), AUTOMATIC_IR_METADATA, CHECK, CHECK_EQ, JoinLoop::codegen(), CompilationOptions::device_type, JoinLoopDomain::element_count, get_int_array_type(), get_int_type(), INNER, MultiSet, CodeGenerator::posArg(), GroupByAndAggregate::query_infos_, query_mem_desc, Set, and ExecutionOptions::with_dynamic_watchdog.

1210  {
1211  AUTOMATIC_IR_METADATA(cgen_state_.get());
1212  const auto exit_bb =
1213  llvm::BasicBlock::Create(cgen_state_->context_, "exit", cgen_state_->current_func_);
1214  cgen_state_->ir_builder_.SetInsertPoint(exit_bb);
1215  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
1216  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
1217  CodeGenerator code_generator(this);
1218 
1219  llvm::BasicBlock* loops_entry_bb{nullptr};
1220  auto has_range_join =
1221  std::any_of(join_loops.begin(), join_loops.end(), [](const auto& join_loop) {
1222  return join_loop.kind() == JoinLoopKind::MultiSet;
1223  });
1224  if (has_range_join) {
1225  CHECK_EQ(join_loops.size(), size_t(1));
1226  const auto element_count =
1227  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_), 9);
1228 
1229  auto compute_packed_offset = [](const int32_t x, const int32_t y) -> uint64_t {
1230  const uint64_t y_shifted = static_cast<uint64_t>(y) << 32;
1231  return y_shifted | static_cast<uint32_t>(x);
1232  };
1233 
1234  const auto values_arr = std::vector<llvm::Constant*>{
1235  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_), 0),
1236  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1237  compute_packed_offset(0, 1)),
1238  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1239  compute_packed_offset(0, -1)),
1240  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1241  compute_packed_offset(1, 0)),
1242  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1243  compute_packed_offset(1, 1)),
1244  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1245  compute_packed_offset(1, -1)),
1246  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1247  compute_packed_offset(-1, 0)),
1248  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1249  compute_packed_offset(-1, 1)),
1250  llvm::ConstantInt::get(get_int_type(64, cgen_state_->context_),
1251  compute_packed_offset(-1, -1))};
1252 
1253  const auto constant_values_array = llvm::ConstantArray::get(
1254  get_int_array_type(64, 9, cgen_state_->context_), values_arr);
1255  CHECK(cgen_state_->module_);
1256  const auto values =
1257  new llvm::GlobalVariable(*cgen_state_->module_,
1258  get_int_array_type(64, 9, cgen_state_->context_),
1259  true,
1260  llvm::GlobalValue::LinkageTypes::InternalLinkage,
1261  constant_values_array);
1262  JoinLoop join_loop(
1263  JoinLoopKind::Set,
1264  JoinType::INNER,
1265  [element_count, values](const std::vector<llvm::Value*>& v) {
1266  JoinLoopDomain domain{{0}};
1267  domain.element_count = element_count;
1268  domain.values_buffer = values;
1269  return domain;
1270  },
1271  nullptr,
1272  nullptr,
1273  nullptr,
1274  nullptr,
1275  "range_key_loop");
1276 
1277  loops_entry_bb = JoinLoop::codegen(
1278  {join_loop},
1279  [this,
1280  query_func,
1281  &query_mem_desc,
1282  &co,
1283  &eo,
1284  &group_by_and_aggregate,
1285  &join_loops,
1286  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
1287  auto& builder = cgen_state_->ir_builder_;
1288 
1289  auto body_exit_bb =
1290  llvm::BasicBlock::Create(cgen_state_->context_,
1291  "range_key_inner_body_exit",
1292  builder.GetInsertBlock()->getParent());
1293 
1294  auto range_key_body_bb =
1295  llvm::BasicBlock::Create(cgen_state_->context_,
1296  "range_key_loop_body",
1297  builder.GetInsertBlock()->getParent());
1298  builder.SetInsertPoint(range_key_body_bb);
1299 
1300  const auto body_loops_entry_bb = JoinLoop::codegen(
1301  join_loops,
1302  [this,
1303  query_func,
1304  &query_mem_desc,
1305  &co,
1306  &eo,
1307  &group_by_and_aggregate,
1308  &join_loops,
1309  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
1310  addJoinLoopIterator(prev_iters, join_loops.size());
1311  auto& builder = cgen_state_->ir_builder_;
1312  const auto loop_body_bb =
1313  llvm::BasicBlock::Create(builder.getContext(),
1314  "loop_body",
1315  builder.GetInsertBlock()->getParent());
1316  builder.SetInsertPoint(loop_body_bb);
1317  const bool can_return_error =
1318  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
1319  if (can_return_error || cgen_state_->needs_error_check_ ||
1320  eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
1321  createErrorCheckControlFlow(query_func,
1322  eo.with_dynamic_watchdog,
1323  eo.allow_runtime_query_interrupt,
1324  join_loops,
1325  co.device_type,
1326  group_by_and_aggregate.query_infos_);
1327  }
1328  return loop_body_bb;
1329  },
1330  prev_iters.back(),
1331  body_exit_bb,
1332  cgen_state_.get());
1333 
1334  builder.SetInsertPoint(range_key_body_bb);
1335  cgen_state_->ir_builder_.CreateBr(body_loops_entry_bb);
1336 
1337  builder.SetInsertPoint(body_exit_bb);
1338  return range_key_body_bb;
1339  },
1340  code_generator.posArg(nullptr),
1341  exit_bb,
1342  cgen_state_.get());
1343  } else {
1344  loops_entry_bb = JoinLoop::codegen(
1345  join_loops,
1346  /*body_codegen=*/
1347  [this,
1348  query_func,
1349  &query_mem_desc,
1350  &co,
1351  &eo,
1352  &group_by_and_aggregate,
1353  &join_loops,
1354  &ra_exe_unit](const std::vector<llvm::Value*>& prev_iters) {
1356  addJoinLoopIterator(prev_iters, join_loops.size());
1357  auto& builder = cgen_state_->ir_builder_;
1358  const auto loop_body_bb = llvm::BasicBlock::Create(
1359  builder.getContext(), "loop_body", builder.GetInsertBlock()->getParent());
1360  builder.SetInsertPoint(loop_body_bb);
1361  const bool can_return_error =
1362  compileBody(ra_exe_unit, group_by_and_aggregate, query_mem_desc, co);
1363  if (can_return_error || cgen_state_->needs_error_check_ ||
1364  eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
1365  createErrorCheckControlFlow(query_func,
1366  eo.with_dynamic_watchdog,
1367  eo.allow_runtime_query_interrupt,
1368  join_loops,
1369  co.device_type,
1370  group_by_and_aggregate.query_infos_);
1371  }
1372  return loop_body_bb;
1373  },
1374  /*outer_iter=*/code_generator.posArg(nullptr),
1375  exit_bb,
1376  cgen_state_.get());
1377  }
1378  CHECK(loops_entry_bb);
1379  cgen_state_->ir_builder_.SetInsertPoint(entry_bb);
1380  cgen_state_->ir_builder_.CreateBr(loops_entry_bb);
1381 }
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, const std::vector< JoinLoop > &join_loops, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
#define CHECK_EQ(x, y)
Definition: Logger.h:301
llvm::Value * element_count
Definition: JoinLoop.h:46
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
static llvm::BasicBlock * codegen(const std::vector< JoinLoop > &join_loops, const std::function< llvm::BasicBlock *(const std::vector< llvm::Value * > &)> &body_codegen, llvm::Value *outer_iter, llvm::BasicBlock *exit_bb, CgenState *cgen_state)
Definition: JoinLoop.cpp:50
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * addJoinLoopIterator(const std::vector< llvm::Value * > &prev_iters, const size_t level_idx)
Definition: IRCodegen.cpp:1185
#define CHECK(condition)
Definition: Logger.h:291
bool any_of(std::vector< Analyzer::Expr * > const &target_exprs)
llvm::ArrayType * get_int_array_type(int const width, int count, llvm::LLVMContext &context)
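
The range-join path materializes the nine neighbor offsets (x, y) with x, y in {-1, 0, 1} as a constant array of 64-bit values, each entry packing the pair into one word exactly as compute_packed_offset does above. A standalone sketch of the packing plus a hypothetical inverse that makes the layout explicit:

#include <cstdint>
#include <utility>

// Same packing as compute_packed_offset above: y occupies the high 32 bits,
// the 32-bit pattern of x occupies the low 32 bits.
uint64_t pack_offset(int32_t x, int32_t y) {
  return (static_cast<uint64_t>(y) << 32) | static_cast<uint32_t>(x);
}

// Hypothetical inverse, shown only for illustration.
std::pair<int32_t, int32_t> unpack_offset(uint64_t packed) {
  const auto x = static_cast<int32_t>(packed & 0xffffffffu);
  const auto y = static_cast<int32_t>(packed >> 32);
  return {x, y};
}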

llvm::Value * Executor::codegenLoadCurrentValueFromColBuf ( WindowFunctionContext window_func_context,
CodeGenerator code_generator,
WindowFrameBoundFuncArgs args 
) const
private

Definition at line 743 of file WindowFunctionIR.cpp.

References CHECK, CodeGenerator::codegenWindowPosition(), WindowFrameBoundFuncArgs::current_row_pos_lv, get_fp_type(), get_int_type(), Analyzer::WindowFunction::getOrderKeys(), WindowFunctionContext::getWindowFunction(), Analyzer::WindowFunction::isFrameNavigateWindowFunction(), and WindowFrameBoundFuncArgs::order_key_buf_ptr_lv.

746  {
747  llvm::Value* current_col_value_ptr_lv{nullptr};
748  const auto order_key_size_in_byte = getOrderKeySize(window_func_context) * 8;
749  auto const order_key_ptr =
750  window_func_context->getWindowFunction()->getOrderKeys().front();
751  CHECK(order_key_ptr);
752  auto const order_col_ti = order_key_ptr->get_type_info();
753  auto const order_col_llvm_type =
754  order_col_ti.is_fp() ? get_fp_type(order_key_size_in_byte, cgen_state_->context_)
755  : get_int_type(order_key_size_in_byte, cgen_state_->context_);
756  if (!window_func_context->getWindowFunction()->isFrameNavigateWindowFunction()) {
757  auto rowid_in_partition_lv = code_generator.codegenWindowPosition(
758  window_func_context, args.current_row_pos_lv);
759  current_col_value_ptr_lv = cgen_state_->ir_builder_.CreateGEP(
760  order_col_llvm_type, args.order_key_buf_ptr_lv, rowid_in_partition_lv);
761  } else {
762  current_col_value_ptr_lv = cgen_state_->ir_builder_.CreateGEP(
763  order_col_llvm_type, args.order_key_buf_ptr_lv, args.current_row_pos_lv);
764  }
765  return cgen_state_->ir_builder_.CreateLoad(
766  current_col_value_ptr_lv->getType()->getPointerElementType(),
767  current_col_value_ptr_lv,
768  "current_col_value");
769 }
bool isFrameNavigateWindowFunction() const
Definition: Analyzer.h:2848
llvm::Value * current_row_pos_lv
Definition: WindowContext.h:90
llvm::Type * get_fp_type(const int width, llvm::LLVMContext &context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
const std::vector< std::shared_ptr< Analyzer::Expr > > & getOrderKeys() const
Definition: Analyzer.h:2802
llvm::Value * codegenWindowPosition(const WindowFunctionContext *window_func_context, llvm::Value *pos_arg)
Definition: ColumnIR.cpp:235
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477
llvm::Value * order_key_buf_ptr_lv
Definition: WindowContext.h:96
#define CHECK(condition)
Definition: Logger.h:291
const Analyzer::WindowFunction * getWindowFunction() const
size_t getOrderKeySize(WindowFunctionContext *window_func_context) const
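
The load differs only in how the row is addressed: navigation functions already know the row's position within the ordered frame and index the order-key buffer directly, while other window functions first map the row position through codegenWindowPosition. A scalar analogue of the final load (the template parameter stands in for the size- and fp-dependent LLVM type chosen above):

#include <cstdint>

// Illustrative analogue of the GEP + load above: read the current row's
// ordering-key value out of the order-key column buffer.
template <typename T>
T load_current_order_key(const T* order_key_buf, int64_t rowid_in_partition) {
  return order_key_buf[rowid_in_partition];
}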

std::pair< std::string, llvm::Value * > Executor::codegenLoadOrderKeyBufPtr ( WindowFunctionContext window_func_context,
CodeGenerator code_generator,
const CompilationOptions co 
) const
private

Definition at line 938 of file WindowFunctionIR.cpp.

References CodegenUtil::createPtrWithHoistedMemoryAddr(), anonymous_namespace{WindowFunctionIR.cpp}::get_col_type_name_by_size(), get_fp_type(), get_int_type(), WindowFunctionContext::getOrderKeyColumnBuffers(), WindowFunctionContext::getOrderKeyColumnBufferTypes(), Analyzer::WindowFunction::getOrderKeys(), WindowFunctionContext::getWindowFunction(), and WindowFunctionContext::NUM_EXECUTION_DEVICES.

941  {
942  auto const order_key_ti =
943  window_func_context->getWindowFunction()->getOrderKeys().front()->get_type_info();
944  auto const order_key_size = order_key_ti.get_size();
945  auto const order_col_type_name = get_col_type_name_by_size(
946  order_key_size,
947  window_func_context->getOrderKeyColumnBufferTypes().front().is_fp());
948  size_t order_key_size_in_byte = order_key_size * 8;
949  auto const order_key_type =
950  order_key_ti.is_fp() ? get_fp_type(order_key_size_in_byte, cgen_state_->context_)
951  : get_int_type(order_key_size_in_byte, cgen_state_->context_);
952  auto const order_key_buf_type = llvm::PointerType::get(order_key_type, 0);
953  auto const order_key_buf = cgen_state_->llInt(
954  reinterpret_cast<int64_t>(window_func_context->getOrderKeyColumnBuffers().front()));
955  auto const order_key_buf_ptr_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
956  cgen_state_.get(),
957  code_generator,
958  co,
959  order_key_buf,
960  order_key_buf_type,
961  WindowFunctionContext::NUM_EXECUTION_DEVICES)
962  .front();
963  return std::make_pair(order_col_type_name, order_key_buf_ptr_lv);
964 }
std::string get_col_type_name_by_size(const size_t size, const bool is_fp)
const std::vector< SQLTypeInfo > & getOrderKeyColumnBufferTypes() const
llvm::Type * get_fp_type(const int width, llvm::LLVMContext &context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
const std::vector< std::shared_ptr< Analyzer::Expr > > & getOrderKeys() const
Definition: Analyzer.h:2802
static const int NUM_EXECUTION_DEVICES
const std::vector< const int8_t * > & getOrderKeyColumnBuffers() const
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477
const Analyzer::WindowFunction * getWindowFunction() const
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
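
As in the other window-function helpers, the buffer lives in the WindowFunctionContext; codegen bakes its raw address into the generated code as a 64-bit integer literal (llInt plus createPtrWithHoistedMemoryAddr) and then treats it as a typed pointer. Conceptually the generated kernel ends up doing the equivalent of the cast below (a sketch; the per-device literal hoisting is omitted):

#include <cstdint>

// Conceptual equivalent of the hoisted-address pattern used above: a buffer
// address embedded as an integer constant is reinterpreted as a typed pointer.
template <typename T>
const T* buffer_from_hoisted_address(int64_t hoisted_addr_literal) {
  return reinterpret_cast<const T*>(hoisted_addr_literal);
}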

WindowPartitionBufferPtrs Executor::codegenLoadPartitionBuffers ( WindowFunctionContext window_func_context,
CodeGenerator code_generator,
const CompilationOptions co,
llvm::Value *  partition_index_lv 
) const
private

Definition at line 966 of file WindowFunctionIR.cpp.

References WindowFunctionContext::counts(), CodegenUtil::createPtrWithHoistedMemoryAddr(), WindowPartitionBufferPtrs::current_partition_start_offset_lv, get_int_type(), WindowPartitionBufferPtrs::num_elem_current_partition_lv, WindowFunctionContext::NUM_EXECUTION_DEVICES, WindowFunctionContext::partitionStartOffset(), WindowFunctionContext::payload(), WindowFunctionContext::sortedPartition(), WindowPartitionBufferPtrs::target_partition_rowid_ptr_lv, and WindowPartitionBufferPtrs::target_partition_sorted_rowid_ptr_lv.

970  {
971  WindowPartitionBufferPtrs bufferPtrs;
972  const auto pi64_type =
973  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
974  const auto pi32_type =
975  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
976 
977  // partial sum of # elems of partitions
978  auto partition_start_offset_buf_lv = cgen_state_->llInt(
979  reinterpret_cast<int64_t>(window_func_context->partitionStartOffset()));
980  auto partition_start_offset_ptr_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
981  cgen_state_.get(),
982  code_generator,
983  co,
984  partition_start_offset_buf_lv,
985  pi64_type,
986  WindowFunctionContext::NUM_EXECUTION_DEVICES)
987  .front();
988 
989  // get start offset of the current partition
990  auto current_partition_start_offset_ptr_lv =
991  cgen_state_->ir_builder_.CreateGEP(get_int_type(64, cgen_state_->context_),
992  partition_start_offset_ptr_lv,
993  partition_index_lv);
994  bufferPtrs.current_partition_start_offset_lv = cgen_state_->ir_builder_.CreateLoad(
995  current_partition_start_offset_ptr_lv->getType()->getPointerElementType(),
996  current_partition_start_offset_ptr_lv);
997 
998  // row_id buf of the current partition
999  const auto partition_rowid_buf_lv =
1000  cgen_state_->llInt(reinterpret_cast<int64_t>(window_func_context->payload()));
1001  const auto partition_rowid_ptr_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
1002  cgen_state_.get(),
1003  code_generator,
1004  co,
1005  partition_rowid_buf_lv,
1006  pi32_type,
1007  WindowFunctionContext::NUM_EXECUTION_DEVICES)
1008  .front();
1009  bufferPtrs.target_partition_rowid_ptr_lv =
1010  cgen_state_->ir_builder_.CreateGEP(get_int_type(32, cgen_state_->context_),
1011  partition_rowid_ptr_lv,
1012  bufferPtrs.current_partition_start_offset_lv);
1013 
1014  // row_id buf of ordered current partition
1015  const auto sorted_rowid_lv = cgen_state_->llInt(
1016  reinterpret_cast<int64_t>(window_func_context->sortedPartition()));
1017  const auto sorted_rowid_ptr_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
1018  cgen_state_.get(),
1019  code_generator,
1020  co,
1021  sorted_rowid_lv,
1022  pi64_type,
1023  WindowFunctionContext::NUM_EXECUTION_DEVICES)
1024  .front();
1025  bufferPtrs.target_partition_sorted_rowid_ptr_lv =
1026  cgen_state_->ir_builder_.CreateGEP(get_int_type(64, cgen_state_->context_),
1027  sorted_rowid_ptr_lv,
1028  bufferPtrs.current_partition_start_offset_lv);
1029 
1030  // # elems per partition
1031  const auto partition_count_buf =
1032  cgen_state_->llInt(reinterpret_cast<int64_t>(window_func_context->counts()));
1033  auto partition_count_buf_ptr_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
1034  cgen_state_.get(),
1035  code_generator,
1036  co,
1037  partition_count_buf,
1038  pi32_type,
1039  WindowFunctionContext::NUM_EXECUTION_DEVICES)
1040  .front();
1041 
1042  // # elems of the given partition
1043  const auto num_elem_current_partition_ptr =
1044  cgen_state_->ir_builder_.CreateGEP(get_int_type(32, cgen_state_->context_),
1045  partition_count_buf_ptr_lv,
1046  partition_index_lv);
1047  bufferPtrs.num_elem_current_partition_lv = cgen_state_->castToTypeIn(
1048  cgen_state_->ir_builder_.CreateLoad(
1049  num_elem_current_partition_ptr->getType()->getPointerElementType(),
1050  num_elem_current_partition_ptr),
1051  64);
1052  return bufferPtrs;
1053 }
llvm::Value * current_partition_start_offset_lv
llvm::Value * num_elem_current_partition_lv
const int32_t * counts() const
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Value * target_partition_sorted_rowid_ptr_lv
llvm::Value * target_partition_rowid_ptr_lv
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
const int64_t * partitionStartOffset() const
static const int NUM_EXECUTION_DEVICES
const int64_t * sortedPartition() const
const int32_t * payload() const
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
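
Taken together, the four buffers give everything needed to enumerate one partition's rows in ORDER BY order: the partition's start offset into the global row-id arrays, its element count, the raw row ids, and their sorted ordering. A rough sketch of how they could compose, assuming sortedPartition() stores partition-relative positions (the exact indexing convention is defined by WindowFunctionContext, so treat this as illustrative):

#include <cstdint>
#include <vector>

// Illustrative walk over one partition using the buffers loaded above.
std::vector<int32_t> rows_of_partition_in_order(const int32_t* payload,
                                                const int64_t* sorted_partition,
                                                const int64_t* partition_start_offset,
                                                const int32_t* counts,
                                                int64_t partition_index) {
  const int64_t start = partition_start_offset[partition_index];
  const int32_t count = counts[partition_index];
  std::vector<int32_t> row_ids;
  row_ids.reserve(count);
  for (int32_t i = 0; i < count; ++i) {
    // Assumed layout: sorted_partition holds, per partition slice, the
    // order-by position of each row; payload maps it back to a row id.
    row_ids.push_back(payload[start + sorted_partition[start + i]]);
  }
  return row_ids;
}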

llvm::BasicBlock * Executor::codegenSkipDeletedOuterTableRow ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co 
)
private

Definition at line 3312 of file NativeCodegen.cpp.

3314  {
3315  AUTOMATIC_IR_METADATA(cgen_state_.get());
3316  if (!co.filter_on_deleted_column) {
3317  return nullptr;
3318  }
3319  CHECK(!ra_exe_unit.input_descs.empty());
3320  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
3321  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
3322  return nullptr;
3323  }
3324  const auto& table_key = outer_input_desc.getTableKey();
3325  const auto deleted_cd = plan_state_->getDeletedColForTable(table_key);
3326  if (!deleted_cd) {
3327  return nullptr;
3328  }
3329  CHECK(deleted_cd->columnType.is_boolean());
3330  const auto deleted_expr =
3331  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
3332  shared::ColumnKey{table_key, deleted_cd->columnId},
3333  outer_input_desc.getNestLevel());
3334  CodeGenerator code_generator(this);
3335  const auto is_deleted =
3336  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
3337  const auto is_deleted_bb = llvm::BasicBlock::Create(
3338  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
3339  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
3340  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
3341  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
3342  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
3343  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3344  cgen_state_->ir_builder_.SetInsertPoint(bb);
3345  return bb;
3346 }
std::vector< InputDescriptor > input_descs
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:291
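
The generated row function simply branches on the codegen'd deleted-column flag of the outer table and returns early for deleted rows, continuing compilation in the "is_not_deleted" block. The emitted control flow has roughly this shape (written as ordinary C++ for illustration; the real output is LLVM IR):

#include <cstdint>

// Shape of the control flow emitted above.
int32_t row_func_shape(bool is_deleted /* codegen'd from the deleted column */) {
  if (is_deleted) {  // "is_deleted" basic block
    return 0;        // CreateRet(llInt<int32_t>(0))
  }
  // "is_not_deleted" basic block: the rest of the row function is emitted here.
  return 0;
}
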
void Executor::codegenWindowAvgEpilogue ( CodeGenerator code_generator,
const CompilationOptions co,
llvm::Value *  crt_val,
llvm::Value *  window_func_null_val 
)
private

Definition at line 1456 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, CodegenUtil::createPtrWithHoistedMemoryAddr(), anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, kFLOAT, and WindowFunctionContext::NUM_EXECUTION_DEVICES.

1459  {
1460  AUTOMATIC_IR_METADATA(cgen_state_.get());
1461  const auto window_func_context =
1462  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
1463  const auto window_func = window_func_context->getWindowFunction();
1464  const auto window_func_ti = get_adjusted_window_type_info(window_func);
1465  const auto pi32_type =
1466  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
1467  const auto pi64_type =
1468  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
1469  const auto aggregate_state_type =
1470  window_func_ti.get_type() == kFLOAT ? pi32_type : pi64_type;
1471  const auto aggregate_state_count_i64 = cgen_state_->llInt(
1472  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
1473  auto aggregate_state_count = CodegenUtil::createPtrWithHoistedMemoryAddr(
1474  cgen_state_.get(),
1475  code_generator,
1476  co,
1477  aggregate_state_count_i64,
1478  aggregate_state_type,
1479  WindowFunctionContext::NUM_EXECUTION_DEVICES)
1480  .front();
1481  std::string agg_count_func_name = "agg_count";
1482  switch (window_func_ti.get_type()) {
1483  case kFLOAT: {
1484  agg_count_func_name += "_float";
1485  break;
1486  }
1487  case kDOUBLE: {
1488  agg_count_func_name += "_double";
1489  break;
1490  }
1491  default: {
1492  break;
1493  }
1494  }
1495  agg_count_func_name += "_skip_val";
1496  cgen_state_->emitCall(agg_count_func_name,
1497  {aggregate_state_count, crt_val, window_func_null_val});
1498 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static const int NUM_EXECUTION_DEVICES
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)
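
AVG keeps two pieces of state: the running sum (the regular aggregate state) and a separate count. This epilogue bumps the count through an agg_count_*_skip_val call so NULL inputs are not counted; the final average is then the sum divided by that count. A scalar analogue of the skip-val counting (illustrative names):

#include <cstdint>

// Illustrative scalar analogue of agg_count_double_skip_val: only values that
// differ from the null sentinel contribute to the count.
void agg_count_skip_val_sketch(int64_t* count_state, double val, double skip_val) {
  if (val != skip_val) {
    ++(*count_state);
  }
}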

std::pair< llvm::Value *, llvm::Value * > Executor::codegenWindowFrameBounds ( WindowFunctionContext window_func_context,
const Analyzer::WindowFrame frame_start_bound,
const Analyzer::WindowFrame frame_end_bound,
llvm::Value *  order_key_col_null_val_lv,
WindowFrameBoundFuncArgs args,
CodeGenerator code_generator 
)
private

Definition at line 1070 of file WindowFunctionIR.cpp.

References run_benchmark_import::args, CHECK, WindowFrameBoundFuncArgs::current_col_value_lv, WindowFunctionContext::getOrderKeyColumnBuffers(), WindowFunctionContext::getWindowFunction(), Analyzer::WindowFrame::hasTimestampTypeFrameBound(), and WindowFrameBoundFuncArgs::order_type_col_name.

1076  {
1077  const auto window_func = window_func_context->getWindowFunction();
1078  CHECK(window_func);
1079  const auto is_timestamp_type_frame = frame_start_bound->hasTimestampTypeFrameBound() ||
1080  frame_end_bound->hasTimestampTypeFrameBound();
1081 
1082  if (window_func->hasRangeModeFraming()) {
1083  CHECK(window_func_context->getOrderKeyColumnBuffers().size() == 1);
1084  CHECK(window_func->getOrderKeys().size() == 1UL);
1085  CHECK(window_func_context->getOrderKeyColumnBuffers().size() == 1UL);
1086  args.order_type_col_name = getOrderKeyTypeName(window_func_context);
1087  args.current_col_value_lv =
1088  codegenLoadCurrentValueFromColBuf(window_func_context, code_generator, args);
1089  }
1090 
1091  auto get_order_key_null_val = [is_timestamp_type_frame,
1092  &order_key_col_null_val_lv,
1093  this](const Analyzer::WindowFrame* frame_bound) {
1094  return is_timestamp_type_frame && !frame_bound->isCurrentRowBound()
1095  ? cgen_state_->castToTypeIn(order_key_col_null_val_lv, 64)
1096  : order_key_col_null_val_lv;
1097  };
1098  auto frame_start_bound_lv =
1099  codegenFrameBound(true,
1100  window_func->hasRangeModeFraming(),
1101  window_func->isFrameNavigateWindowFunction(),
1102  frame_start_bound,
1103  is_timestamp_type_frame,
1104  get_order_key_null_val(frame_start_bound),
1105  args);
1106  auto frame_end_bound_lv =
1107  codegenFrameBound(false,
1108  window_func->hasRangeModeFraming(),
1109  window_func->isFrameNavigateWindowFunction(),
1110  frame_end_bound,
1111  is_timestamp_type_frame,
1112  get_order_key_null_val(frame_end_bound),
1113  args);
1114  CHECK(frame_start_bound_lv);
1115  CHECK(frame_end_bound_lv);
1116  return std::make_pair(frame_start_bound_lv, frame_end_bound_lv);
1117 }
bool hasTimestampTypeFrameBound() const
Definition: Analyzer.h:2702
llvm::Value * current_col_value_lv
Definition: WindowContext.h:91
llvm::Value * codegenFrameBound(bool for_start_bound, bool for_range_mode, bool for_window_frame_naviation, const Analyzer::WindowFrame *frame_bound, bool is_timestamp_type_frame, llvm::Value *order_key_null_val, const WindowFrameBoundFuncArgs &args)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Value * codegenLoadCurrentValueFromColBuf(WindowFunctionContext *window_func_context, CodeGenerator &code_generator, WindowFrameBoundFuncArgs &args) const
const std::string getOrderKeyTypeName(WindowFunctionContext *window_func_context) const
const std::vector< const int8_t * > & getOrderKeyColumnBuffers() const
#define CHECK(condition)
Definition: Logger.h:291
const Analyzer::WindowFunction * getWindowFunction() const
std::string order_type_col_name
Definition: WindowContext.h:97

llvm::Value * Executor::codegenWindowFunction ( const size_t  target_index,
const CompilationOptions co 
)
private

Definition at line 22 of file WindowFunctionIR.cpp.

References WindowProjectNodeContext::activateWindowFunctionContext(), run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, BACKWARD_FILL, CHECK, CHECK_EQ, CONDITIONAL_CHANGE_EVENT, COUNT, COUNT_IF, CUME_DIST, DENSE_RANK, logger::FATAL, FIRST_VALUE, FIRST_VALUE_IN_FRAME, FORWARD_FILL, WindowProjectNodeContext::get(), WindowFunctionContext::getWindowFunction(), LAG, LAG_IN_FRAME, LAST_VALUE, LAST_VALUE_IN_FRAME, LEAD, LEAD_IN_FRAME, LOG, MAX, MIN, NTH_VALUE, NTH_VALUE_IN_FRAME, NTILE, PERCENT_RANK, RANK, ROW_NUMBER, SUM, and SUM_IF.

23  {
24  AUTOMATIC_IR_METADATA(cgen_state_.get());
25  CodeGenerator code_generator(this);
26 
27  const auto window_func_context =
28  WindowProjectNodeContext::get(this)->activateWindowFunctionContext(this,
29  target_index);
30  const auto window_func = window_func_context->getWindowFunction();
31  switch (window_func->getKind()) {
36  return code_generator.codegenWindowPosition(window_func_context,
37  code_generator.posArg(nullptr));
40  return cgen_state_->emitCall("percent_window_func",
41  {cgen_state_->llInt(reinterpret_cast<const int64_t>(
42  window_func_context->output())),
43  code_generator.posArg(nullptr)});
49  // they are always evaluated on the current frame
51  const auto& args = window_func->getArgs();
52  CHECK(!args.empty());
53  const auto arg_lvs = code_generator.codegen(args.front().get(), true, co);
54  CHECK_EQ(arg_lvs.size(), size_t(1));
55  return arg_lvs.front();
56  }
65  return codegenWindowFunctionAggregate(&code_generator, co);
74  default:
75  LOG(FATAL) << "Invalid window function kind";
76  }
77  return nullptr;
78 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
#define LOG(tag)
Definition: Logger.h:285
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
static const WindowProjectNodeContext * get(Executor *executor)
const WindowFunctionContext * activateWindowFunctionContext(Executor *executor, const size_t target_index) const
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::Value * codegenWindowFunctionAggregate(CodeGenerator *code_generator, const CompilationOptions &co)
llvm::Value * codegenWindowNavigationFunctionOnFrame(const CompilationOptions &co)
#define CHECK(condition)
Definition: Logger.h:291
const Analyzer::WindowFunction * getWindowFunction() const
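
Several case labels in the listing above were hyperlinks in the original page and are missing from this dump. Read together with the visible calls and the referenced enumerators, the dispatch most likely groups as follows (a reading of the listing, not verbatim source):

ROW_NUMBER / RANK / DENSE_RANK / NTILE: codegenWindowPosition with the position argument.
PERCENT_RANK / CUME_DIST: emitCall("percent_window_func", ...) over the precomputed output buffer.
LAG / LEAD / FIRST_VALUE / LAST_VALUE (likely with NTH_VALUE): codegen the first argument on the current frame.
The *_IN_FRAME variants plus FORWARD_FILL / BACKWARD_FILL: codegenWindowNavigationFunctionOnFrame(co).
AVG / MIN / MAX / SUM / COUNT / COUNT_IF / SUM_IF / CONDITIONAL_CHANGE_EVENT: codegenWindowFunctionAggregate(&code_generator, co).
Anything else: LOG(FATAL) "Invalid window function kind".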

llvm::Value * Executor::codegenWindowFunctionAggregate ( CodeGenerator code_generator,
const CompilationOptions co 
)
private

Definition at line 255 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, AVG, CHECK, CodegenUtil::createPtrWithHoistedMemoryAddr(), WindowProjectNodeContext::get(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), and WindowFunctionContext::NUM_EXECUTION_DEVICES.

256  {
257  AUTOMATIC_IR_METADATA(cgen_state_.get());
258  auto [reset_state_false_bb, aggregate_state] =
259  codegenWindowResetStateControlFlow(code_generator, co);
260  llvm::Value* aggregate_state_count = nullptr;
261  const auto window_func_context =
262  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
263  const auto window_func = window_func_context->getWindowFunction();
264  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
265  const auto aggregate_state_count_i64 = cgen_state_->llInt(
266  reinterpret_cast<const int64_t>(window_func_context->aggregateStateCount()));
267  const auto pi64_type =
268  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
269  aggregate_state_count = CodegenUtil::createPtrWithHoistedMemoryAddr(
270  cgen_state_.get(),
271  code_generator,
272  co,
273  aggregate_state_count_i64,
274  pi64_type,
275  WindowFunctionContext::NUM_EXECUTION_DEVICES)
276  .front();
277  }
278  codegenWindowFunctionStateInit(code_generator, co, aggregate_state);
279  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
280  const auto count_zero = cgen_state_->llInt(int64_t(0));
281  cgen_state_->emitCall("agg_id", {aggregate_state_count, count_zero});
282  }
283  cgen_state_->ir_builder_.CreateBr(reset_state_false_bb);
284  cgen_state_->ir_builder_.SetInsertPoint(reset_state_false_bb);
286  return codegenWindowFunctionAggregateCalls(aggregate_state, co);
287 }
void codegenWindowFunctionStateInit(CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *aggregate_state)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
static const WindowProjectNodeContext * get(Executor *executor)
std::pair< llvm::BasicBlock *, llvm::Value * > codegenWindowResetStateControlFlow(CodeGenerator *code_generator, const CompilationOptions &co)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static const int NUM_EXECUTION_DEVICES
#define CHECK(condition)
Definition: Logger.h:291
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
llvm::Value * codegenWindowFunctionAggregateCalls(llvm::Value *aggregate_state, const CompilationOptions &co)
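
Per partition the control flow is: branch on the reset condition produced by codegenWindowResetStateControlFlow, seed the aggregate state (plus the separate count for AVG) when a new partition starts, then fall through to the per-row aggregate calls. A plain C++ sketch of that shape (illustrative; the actual initialization value depends on the aggregate kind):

#include <cstdint>

// Shape of the per-row logic assembled above, written as plain C++.
void window_aggregate_row_sketch(bool partition_start,            // reset condition
                                 int64_t* aggregate_state,
                                 int64_t* aggregate_state_count,  // AVG only
                                 bool is_avg) {
  if (partition_start) {
    *aggregate_state = 0;  // stands in for codegenWindowFunctionStateInit(...)
    if (is_avg) {
      *aggregate_state_count = 0;  // emitCall("agg_id", {count_state, 0})
    }
  }
  // "reset_state_false_bb": the per-row aggregate calls are emitted from here on.
}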

llvm::Value * Executor::codegenWindowFunctionAggregateCalls ( llvm::Value *  aggregate_state,
const CompilationOptions co 
)
private

Definition at line 1119 of file WindowFunctionIR.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, AVG, CHECK, CHECK_EQ, CodeGenerator::codegen(), CodeGenerator::codegenCastBetweenIntTypes(), COUNT, COUNT_IF, CodegenUtil::createPtrWithHoistedMemoryAddr(), WindowFrameBoundFuncArgs::current_partition_start_offset_lv, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), anonymous_namespace{WindowFunctionIR.cpp}::get_col_type_name_by_size(), get_int_type(), anonymous_namespace{WindowFunctionIR.cpp}::get_null_value_by_size(), anonymous_namespace{WindowFunctionIR.cpp}::get_window_agg_name(), WindowProjectNodeContext::getActiveWindowFunctionContext(), inline_fixed_encoding_null_val(), kDATE, kDOUBLE, kENCODING_DATE_IN_DAYS, kENCODING_FIXED, kFLOAT, kSUM_IF, kTIME, kTIMESTAMP, kTINYINT, MAX, MIN, WindowFunctionContext::NUM_EXECUTION_DEVICES, CodeGenerator::posArg(), SUM, SUM_IF, and window_function_conditional_aggregate().

1120  {
1121  AUTOMATIC_IR_METADATA(cgen_state_.get());
1122  const auto window_func_context =
1123  WindowProjectNodeContext::getActiveWindowFunctionContext(this);
1124  const auto window_func = window_func_context->getWindowFunction();
1125  const auto window_func_ti = get_adjusted_window_type_info(window_func);
1126  const auto window_func_null_val =
1127  window_func_ti.is_fp()
1128  ? cgen_state_->inlineFpNull(window_func_ti)
1129  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
1130  if (window_func_context->elementCount() == 0) {
1131  // we do not need to generate a code for an empty input table
1132  return window_func->getKind() == SqlWindowFunctionKind::AVG
1133  ? cgen_state_->inlineFpNull(SQLTypeInfo(SQLTypes::kDOUBLE))
1134  : window_func_null_val;
1135  }
1136  const auto& args = window_func->getArgs();
1137  CodeGenerator code_generator(this);
1138  if (window_func_context->needsToBuildAggregateTree()) {
1139  // compute an aggregated value for each row of the window frame by using segment
1140  // tree when constructing a window context, we build a necessary segment tree (so
1141  // called `aggregate tree`) to query the aggregated value of the specific window
1142  // frame
1143  const auto pi64_type =
1144  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0);
1145  const auto ppi64_type = llvm::PointerType::get(
1146  llvm::PointerType::get(get_int_type(64, cgen_state_->context_), 0), 0);
1147 
1148  auto [frame_start_bound_expr_lv, frame_end_bound_expr_lv] =
1149  codegenFrameBoundRange(window_func, code_generator, co);
1150 
1151  // compute aggregated value over the computed frame range
1152  auto current_row_pos_lv = code_generator.posArg(nullptr);
1153  auto partition_index_lv = codegenCurrentPartitionIndex(
1154  window_func_context, &code_generator, co, current_row_pos_lv);
1155 
1156  // ordering column buffer
1157  const auto target_col_ti = args.front()->get_type_info();
1158  const auto target_col_size = target_col_ti.get_size();
1159  const auto col_type_name =
1160  get_col_type_name_by_size(target_col_size, target_col_ti.is_fp());
1161 
1162  const auto partition_buf_ptrs = codegenLoadPartitionBuffers(
1163  window_func_context, &code_generator, co, partition_index_lv);
1164 
1165  auto [order_col_type_name, order_key_buf_ptr_lv] =
1166  codegenLoadOrderKeyBufPtr(window_func_context, &code_generator, co);
1167 
1168  // null value of the ordering column
1169  const auto order_key_buf_ti =
1170  window_func_context->getOrderKeyColumnBufferTypes().front();
1171  auto const ordering_spec = window_func->getCollation().front();
1172  llvm::Value* order_key_col_null_val_lv{nullptr};
1173  switch (order_key_buf_ti.get_type()) {
1174  case kDATE:
1175  case kTIMESTAMP:
1176  case kTIME: {
1177  if (order_key_buf_ti.get_compression() == kENCODING_FIXED ||
1178  order_key_buf_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
1179  auto null_val = inline_fixed_encoding_null_val(order_key_buf_ti);
1180  order_key_col_null_val_lv = cgen_state_->llInt((int32_t)null_val);
1181  break;
1182  }
1183  }
1184  default: {
1185  order_key_col_null_val_lv = cgen_state_->inlineNull(order_key_buf_ti);
1186  break;
1187  }
1188  }
1189 
1190  auto [null_start_pos_lv, null_end_pos_lv] = codegenFrameNullRange(
1191  window_func_context, &code_generator, co, partition_index_lv);
1192  auto nulls_first_lv = cgen_state_->llBool(ordering_spec.nulls_first);
1193 
1195  frame_start_bound_expr_lv,
1196  frame_end_bound_expr_lv,
1197  current_row_pos_lv,
1198  nullptr,
1199  partition_buf_ptrs.current_partition_start_offset_lv,
1200  cgen_state_->llInt((int64_t)0),
1201  cgen_state_->llInt((int64_t)1),
1202  partition_buf_ptrs.num_elem_current_partition_lv,
1203  order_key_buf_ptr_lv,
1204  "",
1205  partition_buf_ptrs.target_partition_rowid_ptr_lv,
1206  partition_buf_ptrs.target_partition_sorted_rowid_ptr_lv,
1207  nulls_first_lv,
1208  null_start_pos_lv,
1209  null_end_pos_lv};
1210  auto [frame_start_bound_lv, frame_end_bound_lv] =
1211  codegenWindowFrameBounds(window_func_context,
1212  window_func->getFrameStartBound(),
1213  window_func->getFrameEndBound(),
1214  order_key_col_null_val_lv,
1216  code_generator);
1217 
1218  // codegen to send a query with frame bound to aggregate tree searcher
1219  llvm::ConstantInt* aggregation_trees_lv{nullptr};
1220  llvm::Value* invalid_val_lv{nullptr};
1221  llvm::Value* null_val_lv{nullptr};
1222  std::string aggregation_tree_search_func_name{"search_"};
1223  std::string aggregation_tree_getter_func_name{"get_"};
1224 
1225  // prepare null values and aggregate_tree getter and searcher depending on
1226  // a type of the ordering column
1227  auto agg_expr_ti = args.front()->get_type_info();
1228  if (agg_expr_ti.is_fp()) {
1229  if (window_func->getKind() == SqlWindowFunctionKind::MIN) {
1230  invalid_val_lv = cgen_state_->llFp(std::numeric_limits<double>::max());
1231  } else if (window_func->getKind() == SqlWindowFunctionKind::MAX) {
1232  invalid_val_lv = cgen_state_->llFp(std::numeric_limits<double>::lowest());
1233  } else {
1234  invalid_val_lv = cgen_state_->llFp((double)0);
1235  }
1236  null_val_lv = cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE));
1237  aggregation_tree_search_func_name += "double";
1238  aggregation_tree_getter_func_name += "double";
1239  } else {
1240  if (window_func->getKind() == SqlWindowFunctionKind::MIN) {
1241  invalid_val_lv = cgen_state_->llInt(std::numeric_limits<int64_t>::max());
1242  } else if (window_func->getKind() == SqlWindowFunctionKind::MAX) {
1243  invalid_val_lv = cgen_state_->llInt(std::numeric_limits<int64_t>::lowest());
1244  } else {
1245  invalid_val_lv = cgen_state_->llInt((int64_t)0);
1246  }
1247  null_val_lv = cgen_state_->llInt(inline_int_null_value<int64_t>());
1248  aggregation_tree_search_func_name += "int64_t";
1249  aggregation_tree_getter_func_name += "integer";
1250  }
1251 
1252  // derived aggregation has a different code path
1253  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
1254  aggregation_tree_search_func_name += "_derived";
1255  aggregation_tree_getter_func_name += "_derived";
1256  }
1257 
1258  // get a buffer holding aggregate trees for each partition
1259  if (agg_expr_ti.is_fp()) {
1260  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
1261  aggregation_trees_lv = cgen_state_->llInt(reinterpret_cast<int64_t>(
1262  window_func_context->getDerivedAggregationTreesForDoubleTypeWindowExpr()));
1263  } else {
1264  aggregation_trees_lv = cgen_state_->llInt(reinterpret_cast<int64_t>(
1265  window_func_context->getAggregationTreesForDoubleTypeWindowExpr()));
1266  }
1267  } else {
1268  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
1269  aggregation_trees_lv = cgen_state_->llInt(reinterpret_cast<int64_t>(
1270  window_func_context->getDerivedAggregationTreesForIntegerTypeWindowExpr()));
1271  } else {
1272  aggregation_trees_lv = cgen_state_->llInt(reinterpret_cast<int64_t>(
1273  window_func_context->getAggregationTreesForIntegerTypeWindowExpr()));
1274  }
1275  }
1276 
1277  CHECK(aggregation_trees_lv);
1278  CHECK(invalid_val_lv);
1279  aggregation_tree_search_func_name += "_aggregation_tree";
1280  aggregation_tree_getter_func_name += "_aggregation_tree";
1281 
1282  // get the aggregate tree of the current partition from a window context
1283  auto aggregation_trees_ptr = CodegenUtil::createPtrWithHoistedMemoryAddr(
1284  cgen_state_.get(),
1285  &code_generator,
1286  co,
1287  aggregation_trees_lv,
1288  ppi64_type,
1289  WindowFunctionContext::NUM_EXECUTION_DEVICES)
1290  .front();
1291  auto target_aggregation_tree_lv = cgen_state_->emitCall(
1292  aggregation_tree_getter_func_name, {aggregation_trees_ptr, partition_index_lv});
1293 
1294  // a depth of segment tree
1295  const auto tree_depth_buf = cgen_state_->llInt(
1296  reinterpret_cast<int64_t>(window_func_context->getAggregateTreeDepth()));
1297  const auto tree_depth_buf_ptr = CodegenUtil::createPtrWithHoistedMemoryAddr(
1298  cgen_state_.get(),
1299  &code_generator,
1300  co,
1301  tree_depth_buf,
1302  pi64_type,
1303  WindowFunctionContext::NUM_EXECUTION_DEVICES)
1304  .front();
1305  const auto current_partition_tree_depth_buf_ptr = cgen_state_->ir_builder_.CreateGEP(
1306  get_int_type(64, cgen_state_->context_), tree_depth_buf_ptr, partition_index_lv);
1307  const auto current_partition_tree_depth_lv = cgen_state_->ir_builder_.CreateLoad(
1308  current_partition_tree_depth_buf_ptr->getType()->getPointerElementType(),
1309  current_partition_tree_depth_buf_ptr);
1310 
1311  // a fanout of the current partition's segment tree
1312  const auto aggregation_tree_fanout_lv = cgen_state_->llInt(
1313  static_cast<int64_t>(window_func_context->getAggregateTreeFanout()));
1314 
1315  // agg_type
1316  const auto agg_type_lv =
1317  cgen_state_->llInt(static_cast<int32_t>(window_func->getKind()));
1318 
1319  // send a query to the aggregate tree with the frame range:
1320  // `frame_start_bound_lv` ~ `frame_end_bound_lv`
1321  auto res_lv =
1322  cgen_state_->emitCall(aggregation_tree_search_func_name,
1323  {target_aggregation_tree_lv,
1324  frame_start_bound_lv,
1325  frame_end_bound_lv,
1326  current_partition_tree_depth_lv,
1327  aggregation_tree_fanout_lv,
1328  cgen_state_->llBool(agg_expr_ti.is_decimal()),
1329  cgen_state_->llInt((int64_t)agg_expr_ti.get_scale()),
1330  invalid_val_lv,
1331  null_val_lv,
1332  agg_type_lv});
1333 
1334  // handling returned null value if exists
1335  std::string null_handler_func_name{"handle_null_val_"};
1336  std::vector<llvm::Value*> null_handler_args{res_lv, null_val_lv};
1337 
1338  // determine null_handling function's name
1339  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
1340  // average aggregate function returns a value as a double
1341  // (and our search* function also returns a double)
1342  if (agg_expr_ti.is_fp()) {
1343  // fp type: double null value
1344  null_handler_func_name += "double_double";
1345  } else {
1346  // non-fp type: int64_t null type
1347  null_handler_func_name += "double_int64_t";
1348  }
1349  } else if (agg_expr_ti.is_fp()) {
1350  // fp type: double null value
1351  null_handler_func_name += "double_double";
1352  } else {
1353  // non-fp type: int64_t null type
1354  null_handler_func_name += "int64_t_int64_t";
1355  }
1356  null_handler_func_name += "_window_framing_agg";
1357 
1358  // prepare null_val
1359  if (window_func->getKind() == SqlWindowFunctionKind::COUNT) {
1360  if (agg_expr_ti.is_fp()) {
1361  null_handler_args.push_back(cgen_state_->llFp((double)0));
1362  } else {
1363  null_handler_args.push_back(cgen_state_->llInt((int64_t)0));
1364  }
1365  } else if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
1366  null_handler_args.push_back(cgen_state_->inlineFpNull(SQLTypeInfo(kDOUBLE)));
1367  } else {
1368  null_handler_args.push_back(cgen_state_->castToTypeIn(window_func_null_val, 64));
1369  }
1370  res_lv = cgen_state_->emitCall(null_handler_func_name, null_handler_args);
1371 
1372  // when AGG_TYPE is double, we get a double type return value we expect an integer
1373  // type value for the count aggregation
1374  if (window_func->getKind() == SqlWindowFunctionKind::COUNT && agg_expr_ti.is_fp()) {
1375  return cgen_state_->ir_builder_.CreateFPToSI(
1376  res_lv, get_int_type(64, cgen_state_->context_));
1377  } else if (window_func->getKind() != SqlWindowFunctionKind::COUNT &&
1378  agg_expr_ti.is_date_in_days()) {
1379  // we need to decode the "encoded" date column value
1380  auto date_null_val = get_null_value_by_size(cgen_state_.get(), agg_expr_ti);
1381  if (date_null_val->getType()->getScalarSizeInBits() != 32) {
1382  date_null_val = cgen_state_->castToTypeIn(date_null_val, 32);
1383  }
1384  return cgen_state_->emitCall("fixed_width_date_decode",
1385  {res_lv, date_null_val, null_val_lv});
1386  }
1387  return res_lv;
1388  } else {
1389  auto agg_name = get_window_agg_name(window_func->getKind(), window_func_ti);
1390  Analyzer::Expr* arg_target_expr;
1391  std::vector<llvm::Value*> agg_func_args{aggregate_state};
1392  auto modified_window_func_null_val = window_func_null_val;
1393  if (args.empty() ||
1394  (window_func->getKind() == SqlWindowFunctionKind::COUNT &&
1395  dynamic_cast<Analyzer::Constant*>(args.front().get()) != nullptr)) {
1396  // a count aggregation without an expression: COUNT(1) or COUNT(*)
1397  agg_func_args.push_back(cgen_state_->llInt(int64_t(1)));
1398  } else {
1399  // we use #base_agg_func_name##_skip_val agg function
1400  // i.e.,int64_t agg_sum_skip_val(int64_t* agg, int64_t val, int64_t skip_val)
1401  arg_target_expr = args.front().get();
1402  const auto arg_lvs = code_generator.codegen(arg_target_expr, true, co);
1403  CHECK_EQ(arg_lvs.size(), size_t(1));
1404  // handling current row's value
1405  auto crt_val = arg_lvs.front();
1406  if ((window_func->getKind() == SqlWindowFunctionKind::SUM ||
1407  window_func->getKind() == SqlWindowFunctionKind::SUM_IF) &&
1408  !window_func_ti.is_fp()) {
1409  crt_val = code_generator.codegenCastBetweenIntTypes(
1410  arg_lvs.front(), args.front()->get_type_info(), window_func_ti, false);
1411  }
1412  agg_func_args.push_back(window_func_ti.get_type() == kFLOAT
1413  ? crt_val
1414  : cgen_state_->castToTypeIn(crt_val, 64));
1415  // handle null value and conditional value for conditional aggregates if necessary
1416  llvm::Value* cond_lv{nullptr};
1417  if (window_function_conditional_aggregate(window_func->getKind())) {
1418  switch (window_func->getKind()) {
1419  case SqlWindowFunctionKind::COUNT_IF:
1420  // COUNT_IF has a single condition expr which is always bool type
1421  modified_window_func_null_val = cgen_state_->castToTypeIn(
1422  cgen_state_->inlineNull(SQLTypeInfo(kTINYINT)), 64);
1423  break;
1424  case SqlWindowFunctionKind::SUM_IF: {
1425  // an FP-type input col uses its own null value depending on the type;
1426  // otherwise (integer-type input col), we use an 8-byte type
1427  if (args.front()->get_type_info().is_integer()) {
1428  agg_func_args[1] = cgen_state_->castToTypeIn(agg_func_args[1], 64);
1429  // keep the null value but casting its type to 8-byte
1430  modified_window_func_null_val =
1431  cgen_state_->castToTypeIn(window_func_null_val, 64);
1432  }
1433  auto cond_expr_lv = code_generator.codegen(args[1].get(), true, co).front();
1434  cond_lv =
1436  }
1437  default:
1438  break;
1439  }
1440  }
1441  agg_name += "_skip_val";
1442  agg_func_args.push_back(modified_window_func_null_val);
1443  if (cond_lv) {
1444  agg_func_args.push_back(cond_lv);
1445  }
1446  }
1447  cgen_state_->emitCall(agg_name, agg_func_args);
1448  if (window_func->getKind() == SqlWindowFunctionKind::AVG) {
1450  &code_generator, co, agg_func_args[1], window_func_null_val);
1451  }
1452  return codegenAggregateWindowState(&code_generator, co, aggregate_state);
1453  }
1454 }
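
The listing above assembles the runtime null-handler name from the aggregate kind and the value type. A minimal standalone sketch of that suffix selection follows (plain C++, not the Executor API; the function name is illustrative): AVG always produces a double result, so it pairs a double result slot with the input's null width, while other aggregates use matching double or 64-bit integer slots.

#include <iostream>
#include <string>

// Illustrative helper (not the Executor API): pick the type suffix of the
// "*_window_framing_agg" null handler, as in the branches above.
std::string framing_agg_suffix(bool is_avg, bool value_is_fp) {
  std::string suffix;
  if (is_avg) {
    // AVG returns a double, paired with the input's null representation.
    suffix = value_is_fp ? "double_double" : "double_int64_t";
  } else {
    suffix = value_is_fp ? "double_double" : "int64_t_int64_t";
  }
  return suffix + "_window_framing_agg";
}

int main() {
  std::cout << framing_agg_suffix(true, false) << '\n';   // double_int64_t_window_framing_agg
  std::cout << framing_agg_suffix(false, false) << '\n';  // int64_t_int64_t_window_framing_agg
}
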
std::string get_col_type_name_by_size(const size_t size, const bool is_fp)
#define CHECK_EQ(x, y)
Definition: Logger.h:301
Definition: sqltypes.h:76
bool window_function_conditional_aggregate(const SqlWindowFunctionKind kind)
Definition: WindowContext.h:60
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
std::string get_window_agg_name(const SqlWindowFunctionKind kind, const SQLTypeInfo &window_func_ti)
std::pair< llvm::Value *, llvm::Value * > codegenFrameNullRange(WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *partition_index_lv) const
std::pair< llvm::Value *, llvm::Value * > codegenWindowFrameBounds(WindowFunctionContext *window_func_context, const Analyzer::WindowFrame *frame_start_bound, const Analyzer::WindowFrame *frame_end_bound, llvm::Value *order_key_col_null_val_lv, WindowFrameBoundFuncArgs &args, CodeGenerator &code_generator)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static const int NUM_EXECUTION_DEVICES
llvm::Value * current_partition_start_offset_lv
Definition: WindowContext.h:92
Definition: sqltypes.h:80
void codegenWindowAvgEpilogue(CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *crt_val, llvm::Value *window_func_null_val)
llvm::Value * codegenConditionalAggregateCondValSelector(llvm::Value *cond_lv, SQLAgg const aggKind, CompilationOptions const &co) const
std::pair< llvm::Value *, llvm::Value * > codegenFrameBoundRange(const Analyzer::WindowFunction *window_func, CodeGenerator &code_generator, const CompilationOptions &co)
llvm::Value * codegenCurrentPartitionIndex(const WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *current_row_pos_lv)
#define CHECK(condition)
Definition: Logger.h:291
WindowPartitionBufferPtrs codegenLoadPartitionBuffers(WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *partition_index_lv) const
int64_t inline_fixed_encoding_null_val(const SQL_TYPE_INFO &ti)
std::pair< std::string, llvm::Value * > codegenLoadOrderKeyBufPtr(WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co) const
llvm::Value * get_null_value_by_size(CgenState *cgen_state, SQLTypeInfo col_ti)
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
llvm::Value * codegenAggregateWindowState(CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *aggregate_state)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)


void Executor::codegenWindowFunctionStateInit ( CodeGenerator code_generator,
const CompilationOptions co,
llvm::Value *  aggregate_state 
)
private

Definition at line 329 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, COUNT, COUNT_IF, anonymous_namespace{WindowFunctionIR.cpp}::get_adjusted_window_type_info(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kDOUBLE, and kFLOAT.

331  {
333  const auto window_func_context =
335  const auto window_func = window_func_context->getWindowFunction();
336  const auto window_func_ti = get_adjusted_window_type_info(window_func);
337  const auto window_func_null_val =
338  window_func_ti.is_fp()
339  ? cgen_state_->inlineFpNull(window_func_ti)
340  : cgen_state_->castToTypeIn(cgen_state_->inlineIntNull(window_func_ti), 64);
341  llvm::Value* window_func_init_val;
342  const auto window_func_kind = window_func_context->getWindowFunction()->getKind();
343  if (window_func_kind == SqlWindowFunctionKind::COUNT ||
344  window_func_kind == SqlWindowFunctionKind::COUNT_IF) {
345  switch (window_func_ti.get_type()) {
346  case kFLOAT: {
347  window_func_init_val = cgen_state_->llFp(float(0));
348  break;
349  }
350  case kDOUBLE: {
351  window_func_init_val = cgen_state_->llFp(double(0));
352  break;
353  }
354  default: {
355  window_func_init_val = cgen_state_->llInt(int64_t(0));
356  break;
357  }
358  }
359  } else {
360  window_func_init_val = window_func_null_val;
361  }
362  const auto pi32_type =
363  llvm::PointerType::get(get_int_type(32, cgen_state_->context_), 0);
364  switch (window_func_ti.get_type()) {
365  case kDOUBLE: {
366  cgen_state_->emitCall("agg_id_double", {aggregate_state, window_func_init_val});
367  break;
368  }
369  case kFLOAT: {
370  aggregate_state =
371  cgen_state_->ir_builder_.CreateBitCast(aggregate_state, pi32_type);
372  cgen_state_->emitCall("agg_id_float", {aggregate_state, window_func_init_val});
373  break;
374  }
375  default: {
376  cgen_state_->emitCall("agg_id", {aggregate_state, window_func_init_val});
377  break;
378  }
379  }
380 }
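
A small illustrative sketch of the state-initialization policy above (stand-in names, not the Executor API): COUNT-like window functions start their slot at zero, every other aggregate starts at the type's null sentinel, and the runtime entry point that writes the initial value is chosen by the adjusted window function type.

#include <iostream>
#include <string>

enum class SlotType { kInt64, kFloat, kDouble };

// Which runtime entry point writes the initial value into the aggregate
// state slot, keyed on the adjusted window function type (mirrors the
// switch above; the float slot pointer is bitcast to i32* first).
std::string init_call_for(SlotType type) {
  switch (type) {
    case SlotType::kDouble: return "agg_id_double";
    case SlotType::kFloat:  return "agg_id_float";
    default:                return "agg_id";  // 64-bit integer slot
  }
}

// COUNT and COUNT_IF start from zero; every other aggregate starts from the
// type's null sentinel so the first real value simply overwrites it.
double initial_value(bool is_count_like, double null_sentinel) {
  return is_count_like ? 0.0 : null_sentinel;
}

int main() {
  std::cout << init_call_for(SlotType::kFloat) << '\n';  // agg_id_float
  std::cout << initial_value(false, -2147483648.0) << '\n';
}
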
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
SQLTypeInfo get_adjusted_window_type_info(const Analyzer::WindowFunction *window_func)


llvm::Value * Executor::codegenWindowNavigationFunctionOnFrame ( const CompilationOptions co)
private

Definition at line 382 of file WindowFunctionIR.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, CHECK, CodegenUtil::createPtrWithHoistedMemoryAddr(), WindowFrameBoundFuncArgs::current_partition_start_offset_lv, FIRST_VALUE_IN_FRAME, FORWARD_FILL, anonymous_namespace{WindowFunctionIR.cpp}::get_col_type_name_by_size(), get_fp_type(), get_int_type(), anonymous_namespace{WindowFunctionIR.cpp}::get_null_value_by_size(), anonymous_namespace{WindowFunctionIR.cpp}::get_null_value_by_size_with_encoding(), WindowProjectNodeContext::getActiveWindowFunctionContext(), kENCODING_DATE_IN_DAYS, kSecsPerDay, LAG_IN_FRAME, LAST_VALUE_IN_FRAME, LEAD_IN_FRAME, NTH_VALUE_IN_FRAME, WindowFunctionContext::NUM_EXECUTION_DEVICES, and UNREACHABLE.

383  {
385  const auto window_func_context =
387  const auto window_func = window_func_context->getWindowFunction();
388  const auto window_func_kind = window_func->getKind();
389  const auto& args = window_func->getArgs();
390  CHECK(args.size() >= 1 && args.size() <= 3);
391  CodeGenerator code_generator(this);
392 
393  const auto target_col_ti = args.front()->get_type_info();
394  const auto target_col_size = target_col_ti.get_size();
395  const auto target_col_type_name =
396  get_col_type_name_by_size(target_col_size, target_col_ti.is_fp());
397  const auto target_col_logical_type_name = get_col_type_name_by_size(
398  window_func->get_type_info().get_size(), window_func->get_type_info().is_fp());
399 
400  // when target_column is fixed-encoded, we store the actual column value with the
401  // encoding applied, but our resultset analyzer only considers the type without the
402  // encoding scheme, so we handle the two null values separately
403  auto logical_null_val_lv =
404  get_null_value_by_size(cgen_state_.get(), window_func->get_type_info());
405  auto target_col_null_val_lv =
407  if (window_func_context->elementCount() == 0) {
408  // we do not need to generate code for an empty input table
409  return target_col_null_val_lv;
410  }
411 
412  auto current_row_pos_lv = code_generator.posArg(nullptr);
413  auto partition_index_lv = codegenCurrentPartitionIndex(
414  window_func_context, &code_generator, co, current_row_pos_lv);
415 
416  // load window function input expression; target_column
417  size_t target_col_size_in_byte = target_col_size * 8;
418  llvm::Type* col_buf_ptr_type =
419  target_col_ti.is_fp()
420  ? get_fp_type(target_col_size_in_byte, cgen_state_->context_)
421  : get_int_type(target_col_size_in_byte, cgen_state_->context_);
422  auto col_buf_type = llvm::PointerType::get(col_buf_ptr_type, 0);
423  auto target_col_buf_ptr_lv = cgen_state_->llInt(reinterpret_cast<int64_t>(
424  window_func_context->getColumnBufferForWindowFunctionExpressions().front()));
425  auto target_col_buf_lv = CodegenUtil::createPtrWithHoistedMemoryAddr(
426  cgen_state_.get(),
427  &code_generator,
428  co,
429  target_col_buf_ptr_lv,
430  col_buf_type,
432  .front();
433 
434  // prepare various buffer ptrs related to the window partition
435  auto partition_buf_ptrs = codegenLoadPartitionBuffers(
436  window_func_context, &code_generator, co, partition_index_lv);
437 
438  // null value of the ordering column
439  const auto order_key_buf_ti =
440  window_func_context->getOrderKeyColumnBufferTypes().front();
441  auto const ordering_spec = window_func->getCollation().front();
442  auto order_key_col_null_val_lv =
443  get_null_value_by_size_with_encoding(cgen_state_.get(), order_key_buf_ti);
444 
445  // load ordering column
446  auto [order_col_type_name, order_key_buf_ptr_lv] =
447  codegenLoadOrderKeyBufPtr(window_func_context, &code_generator, co);
448 
449  // null range
450  auto [null_start_pos_lv, null_end_pos_lv] =
451  codegenFrameNullRange(window_func_context, &code_generator, co, partition_index_lv);
452 
453  // compute the index of the current row w.r.t. the window frame it belongs to
454  std::string row_idx_on_frame_func = "compute_";
455  row_idx_on_frame_func += order_col_type_name;
456  row_idx_on_frame_func += ordering_spec.is_desc ? "_greater_equal" : "_less_equal";
457  row_idx_on_frame_func += "_current_row_idx_in_frame";
458  auto int64_t_one_val_lv = cgen_state_->llInt((int64_t)1);
459  auto nulls_first_lv = cgen_state_->llBool(ordering_spec.nulls_first);
460  auto cur_row_idx_in_frame_lv =
461  cgen_state_->emitCall(row_idx_on_frame_func,
462  {partition_buf_ptrs.num_elem_current_partition_lv,
463  current_row_pos_lv,
464  order_key_buf_ptr_lv,
465  partition_buf_ptrs.target_partition_rowid_ptr_lv,
466  partition_buf_ptrs.target_partition_sorted_rowid_ptr_lv,
467  order_key_col_null_val_lv,
468  nulls_first_lv,
469  null_start_pos_lv,
470  null_end_pos_lv});
471 
472  if (window_func->isMissingValueFillingFunction()) {
473  // We classify both FORWARD_FILL and BACKWARD_FILL as window frame navigation functions
474  // b/c they need to determine the current row index within a sorted partition
475  // (as we do for window frame navigation functions) to compute a correct and
476  // consistent resultset. Otherwise, the query result may differ per execution due to
477  // missing table ordering. Now we know the current row's index in the sorted
478  // partition (cur_row_idx_in_frame_lv), so we can return by calling the runtime
479  // function with the index we computed
480  std::string func_name = "fill_" + target_col_type_name + "_missing_value";
481 
482  llvm::Value* forward_fill_lv =
483  cgen_state_->llBool(window_func_kind == SqlWindowFunctionKind::FORWARD_FILL);
484  return cgen_state_->emitCall(func_name,
485  {cur_row_idx_in_frame_lv,
486  target_col_null_val_lv,
487  target_col_buf_lv,
488  partition_buf_ptrs.num_elem_current_partition_lv,
489  partition_buf_ptrs.target_partition_rowid_ptr_lv,
490  partition_buf_ptrs.target_partition_sorted_rowid_ptr_lv,
491  forward_fill_lv});
492  }
493 
494  // compute frame bound for the current row
495  auto [frame_start_bound_expr_lv, frame_end_bound_expr_lv] =
496  codegenFrameBoundRange(window_func, code_generator, co);
497 
498  // build the arguments needed to compute the frame bounds for the current row
499  auto const int64_t_zero_val_lv = cgen_state_->llInt((int64_t)0);
501  frame_start_bound_expr_lv,
502  frame_end_bound_expr_lv,
503  window_func->hasRangeModeFraming() ? current_row_pos_lv : cur_row_idx_in_frame_lv,
504  nullptr,
505  window_func->hasRangeModeFraming()
506  ? int64_t_zero_val_lv
507  : partition_buf_ptrs.current_partition_start_offset_lv,
508  int64_t_zero_val_lv,
509  int64_t_one_val_lv,
510  partition_buf_ptrs.num_elem_current_partition_lv,
511  order_key_buf_ptr_lv,
512  "",
513  partition_buf_ptrs.target_partition_rowid_ptr_lv,
514  partition_buf_ptrs.target_partition_sorted_rowid_ptr_lv,
515  nulls_first_lv,
516  null_start_pos_lv,
517  null_end_pos_lv};
518  auto [frame_start_bound_lv, frame_end_bound_lv] =
519  codegenWindowFrameBounds(window_func_context,
520  window_func->getFrameStartBound(),
521  window_func->getFrameEndBound(),
522  order_key_col_null_val_lv,
524  code_generator);
525 
526  // compute the index of the current row in the frame it belongs to
527  llvm::Value* modified_cur_row_idx_in_frame_lv{nullptr};
528  llvm::Value* offset_lv{nullptr};
529  switch (window_func_kind) {
530  case SqlWindowFunctionKind::LAG_IN_FRAME:
531  offset_lv = cgen_state_->castToTypeIn(
532  code_generator.codegen(args[1].get(), true, co)[0], 64);
533  modified_cur_row_idx_in_frame_lv =
534  cgen_state_->ir_builder_.CreateSub(cur_row_idx_in_frame_lv, offset_lv);
535  break;
536  case SqlWindowFunctionKind::LEAD_IN_FRAME:
537  offset_lv = cgen_state_->castToTypeIn(
538  code_generator.codegen(args[1].get(), true, co)[0], 64);
539  modified_cur_row_idx_in_frame_lv =
540  cgen_state_->ir_builder_.CreateAdd(cur_row_idx_in_frame_lv, offset_lv);
541  break;
542  case SqlWindowFunctionKind::FIRST_VALUE_IN_FRAME:
543  modified_cur_row_idx_in_frame_lv = frame_start_bound_lv;
544  break;
545  case SqlWindowFunctionKind::LAST_VALUE_IN_FRAME:
546  modified_cur_row_idx_in_frame_lv = frame_end_bound_lv;
547  break;
548  case SqlWindowFunctionKind::NTH_VALUE_IN_FRAME: {
549  offset_lv = cgen_state_->castToTypeIn(
550  code_generator.codegen(args[1].get(), true, co)[0], 64);
551  auto candidate_offset_lv =
552  cgen_state_->ir_builder_.CreateAdd(frame_start_bound_lv, offset_lv);
553  auto out_of_frame_bound_lv =
554  cgen_state_->ir_builder_.CreateICmpSGT(candidate_offset_lv, frame_end_bound_lv);
555  modified_cur_row_idx_in_frame_lv = cgen_state_->ir_builder_.CreateSelect(
556  out_of_frame_bound_lv, cgen_state_->llInt((int64_t)-1), candidate_offset_lv);
557  break;
558  }
559  default:
560  UNREACHABLE() << "Unsupported window function to navigate a window frame.";
561  }
562  CHECK(modified_cur_row_idx_in_frame_lv);
563 
564  // get the target column value in the frame w.r.t. the offset
565  std::string target_func_name = "get_";
566  target_func_name += target_col_type_name + "_value_";
567  target_func_name += target_col_logical_type_name + "_type_";
568  target_func_name += "in_frame";
569  auto res_lv =
570  cgen_state_->emitCall(target_func_name,
571  {modified_cur_row_idx_in_frame_lv,
572  frame_start_bound_lv,
573  frame_end_bound_lv,
574  target_col_buf_lv,
575  partition_buf_ptrs.target_partition_rowid_ptr_lv,
576  partition_buf_ptrs.target_partition_sorted_rowid_ptr_lv,
577  logical_null_val_lv,
578  target_col_null_val_lv});
579  if (target_col_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
580  res_lv = cgen_state_->emitCall(
581  "encode_date",
582  {res_lv, logical_null_val_lv, cgen_state_->llInt((int64_t)kSecsPerDay)});
583  }
584  CHECK(res_lv);
585  return res_lv;
586 }
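
The switch above reduces every frame-navigation function to the question "which row index in the frame do we read". The following standalone sketch mirrors that arithmetic with plain integers (illustrative names, assuming -1 as the out-of-frame sentinel that the runtime value getter checks for).

#include <cstdint>
#include <iostream>

enum class NavKind { kLagInFrame, kLeadInFrame, kFirstValueInFrame,
                     kLastValueInFrame, kNthValueInFrame };

// Index of the row whose value the navigation function reads, mirroring the
// switch above; -1 plays the role of the out-of-frame sentinel.
int64_t target_row_index(NavKind kind, int64_t cur_row_idx_in_frame,
                         int64_t frame_start, int64_t frame_end, int64_t offset) {
  switch (kind) {
    case NavKind::kLagInFrame:        return cur_row_idx_in_frame - offset;
    case NavKind::kLeadInFrame:       return cur_row_idx_in_frame + offset;
    case NavKind::kFirstValueInFrame: return frame_start;
    case NavKind::kLastValueInFrame:  return frame_end;
    case NavKind::kNthValueInFrame: {
      const int64_t candidate = frame_start + offset;
      return candidate > frame_end ? -1 : candidate;  // out of frame bound
    }
  }
  return -1;
}

int main() {
  // LAG(expr, 2) on the fifth row of the frame reads the third row.
  std::cout << target_row_index(NavKind::kLagInFrame, 4, 0, 9, 2) << '\n';  // 2
}
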
std::string get_col_type_name_by_size(const size_t size, const bool is_fp)
static constexpr int64_t kSecsPerDay
#define UNREACHABLE()
Definition: Logger.h:338
llvm::Type * get_fp_type(const int width, llvm::LLVMContext &context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
std::pair< llvm::Value *, llvm::Value * > codegenFrameNullRange(WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *partition_index_lv) const
std::pair< llvm::Value *, llvm::Value * > codegenWindowFrameBounds(WindowFunctionContext *window_func_context, const Analyzer::WindowFrame *frame_start_bound, const Analyzer::WindowFrame *frame_end_bound, llvm::Value *order_key_col_null_val_lv, WindowFrameBoundFuncArgs &args, CodeGenerator &code_generator)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static const int NUM_EXECUTION_DEVICES
llvm::Value * current_partition_start_offset_lv
Definition: WindowContext.h:92
std::pair< llvm::Value *, llvm::Value * > codegenFrameBoundRange(const Analyzer::WindowFunction *window_func, CodeGenerator &code_generator, const CompilationOptions &co)
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477
llvm::Value * codegenCurrentPartitionIndex(const WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *current_row_pos_lv)
#define CHECK(condition)
Definition: Logger.h:291
WindowPartitionBufferPtrs codegenLoadPartitionBuffers(WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co, llvm::Value *partition_index_lv) const
std::pair< std::string, llvm::Value * > codegenLoadOrderKeyBufPtr(WindowFunctionContext *window_func_context, CodeGenerator *code_generator, const CompilationOptions &co) const
llvm::Value * get_null_value_by_size(CgenState *cgen_state, SQLTypeInfo col_ti)
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)
llvm::Value * get_null_value_by_size_with_encoding(CgenState *cgen_state, SQLTypeInfo col_ti)


std::pair< llvm::BasicBlock *, llvm::Value * > Executor::codegenWindowResetStateControlFlow ( CodeGenerator code_generator,
const CompilationOptions co 
)
private

Definition at line 289 of file WindowFunctionIR.cpp.

References AUTOMATIC_IR_METADATA, CodegenUtil::createPtrWithHoistedMemoryAddr(), get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), WindowFunctionContext::NUM_EXECUTION_DEVICES, CodeGenerator::posArg(), and CodeGenerator::toBool().

291  {
293  const auto window_func_context =
295  auto aggregate_state = aggregateWindowStatePtr(code_generator, co);
296  const auto bitset = cgen_state_->llInt(
297  reinterpret_cast<const int64_t>(window_func_context->partitionStart()));
298  const auto bitset_lv =
300  cgen_state_.get(),
301  code_generator,
302  co,
303  bitset,
304  llvm::PointerType::get(get_int_type(8, cgen_state_->context_), 0),
306  .front();
307  const auto min_val = cgen_state_->llInt(int64_t(0));
308  const auto max_val = cgen_state_->llInt(window_func_context->elementCount() - 1);
309  const auto null_val = cgen_state_->llInt(inline_int_null_value<int64_t>());
310  const auto null_bool_val = cgen_state_->llInt<int8_t>(inline_int_null_value<int8_t>());
311  const auto reset_state =
312  code_generator->toBool(cgen_state_->emitCall("bit_is_set",
313  {bitset_lv,
314  code_generator->posArg(nullptr),
315  min_val,
316  max_val,
317  null_val,
318  null_bool_val}));
319  const auto reset_state_true_bb = llvm::BasicBlock::Create(
320  cgen_state_->context_, "reset_state.true", cgen_state_->current_func_);
321  const auto reset_state_false_bb = llvm::BasicBlock::Create(
322  cgen_state_->context_, "reset_state.false", cgen_state_->current_func_);
323  cgen_state_->ir_builder_.CreateCondBr(
324  reset_state, reset_state_true_bb, reset_state_false_bb);
325  cgen_state_->ir_builder_.SetInsertPoint(reset_state_true_bb);
326  return std::make_pair(reset_state_false_bb, aggregate_state);
327 }
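
The reset-state branch above is driven by a per-row bitset returned by partitionStart(): a set bit marks a row that begins a new partition, and the generated bit_is_set call decides whether the aggregate state must be reset before the row is processed. A toy version of that test (the real runtime helper also takes min/max/null arguments, omitted here):

#include <cstdint>
#include <iostream>
#include <vector>

// Toy partition-start test: one bit per row, set on rows that begin a new
// partition (the real runtime bit_is_set also handles min/max/null values).
bool bit_is_set(const std::vector<uint8_t>& bitset, uint64_t pos) {
  return (bitset[pos >> 3] >> (pos & 7)) & 1;
}

int main() {
  std::vector<uint8_t> partition_start(1, 0);
  partition_start[0] |= 1 << 0;  // row 0 starts the first partition
  partition_start[0] |= 1 << 5;  // row 5 starts the second partition
  for (uint64_t row = 0; row < 8; ++row) {
    if (bit_is_set(partition_start, row)) {
      std::cout << "reset aggregate state at row " << row << '\n';
    }
  }
}
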
llvm::Value * posArg(const Analyzer::Expr *) const
Definition: ColumnIR.cpp:590
llvm::Value * aggregateWindowStatePtr(CodeGenerator *code_generator, const CompilationOptions &co)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static WindowFunctionContext * getActiveWindowFunctionContext(Executor *executor)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static const int NUM_EXECUTION_DEVICES
llvm::Value * toBool(llvm::Value *)
Definition: LogicalIR.cpp:343
std::vector< llvm::Value * > createPtrWithHoistedMemoryAddr(CgenState *cgen_state, CodeGenerator *code_generator, CompilationOptions const &co, llvm::ConstantInt *ptr_int_val, llvm::Type *type, size_t num_devices_to_hoist_literal)


ResultSetPtr Executor::collectAllDeviceResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
const QueryMemoryDescriptor query_mem_desc,
const ExecutorDeviceType  device_type,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner 
)
private

Definition at line 2682 of file Execute.cpp.

References anonymous_namespace{Execute.cpp}::build_row_for_empty_input(), collectAllDeviceShardedTopResults(), DEBUG_TIMER, SharedKernelContext::getFragmentResults(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, NonGroupedAggregate, reduceMultiDeviceResults(), reduceSpeculativeTopN(), GroupByAndAggregate::shard_count_for_top_groups(), RelAlgExecutionUnit::target_exprs, and use_speculative_top_n().

Referenced by executeWorkUnitImpl().

2687  {
2688  auto timer = DEBUG_TIMER(__func__);
2689  auto& result_per_device = shared_context.getFragmentResults();
2690  if (result_per_device.empty() && query_mem_desc.getQueryDescriptionType() ==
2693  ra_exe_unit.target_exprs, query_mem_desc, device_type);
2694  }
2695  if (use_speculative_top_n(ra_exe_unit, query_mem_desc)) {
2696  try {
2697  return reduceSpeculativeTopN(
2698  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
2699  } catch (const std::bad_alloc&) {
2700  throw SpeculativeTopNFailed("Failed during multi-device reduction.");
2701  }
2702  }
2703  const auto shard_count =
2704  device_type == ExecutorDeviceType::GPU
2706  : 0;
2707 
2708  if (shard_count && !result_per_device.empty()) {
2709  return collectAllDeviceShardedTopResults(shared_context, ra_exe_unit, device_type);
2710  }
2711  return reduceMultiDeviceResults(
2712  ra_exe_unit, result_per_device, row_set_mem_owner, query_mem_desc);
2713 }
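
The method above picks one of four reduction strategies. A compact sketch of that decision order, with plain booleans standing in for the query-descriptor and execution-unit checks:

#include <cstddef>
#include <iostream>

enum class Reduction { kEmptyInputRow, kSpeculativeTopN, kShardedTopN, kMultiDevice };

// Same decision order as the listing above.
Reduction choose_reduction(bool results_empty, bool non_grouped_agg,
                           bool speculative_top_n, std::size_t shard_count) {
  if (results_empty && non_grouped_agg) return Reduction::kEmptyInputRow;
  if (speculative_top_n) return Reduction::kSpeculativeTopN;
  if (shard_count > 0 && !results_empty) return Reduction::kShardedTopN;
  return Reduction::kMultiDevice;
}

int main() {
  std::cout << static_cast<int>(choose_reduction(false, false, false, 4)) << '\n';  // 2 == kShardedTopN
}
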
std::vector< Analyzer::Expr * > target_exprs
bool use_speculative_top_n(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc)
ResultSetPtr collectAllDeviceShardedTopResults(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type) const
Definition: Execute.cpp:2797
ResultSetPtr reduceSpeculativeTopN(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:1699
ResultSetPtr reduceMultiDeviceResults(const RelAlgExecutionUnit &, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:1564
QueryDescriptionType getQueryDescriptionType() const
static size_t shard_count_for_top_groups(const RelAlgExecutionUnit &ra_exe_unit)
ResultSetPtr build_row_for_empty_input(const std::vector< Analyzer::Expr * > &target_exprs_in, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
Definition: Execute.cpp:2641
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define DEBUG_TIMER(name)
Definition: Logger.h:412



ResultSetPtr Executor::collectAllDeviceShardedTopResults ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
const ExecutorDeviceType  device_type 
) const
private

Definition at line 2797 of file Execute.cpp.

References blockSize(), CHECK, CHECK_EQ, CHECK_LE, SharedKernelContext::getFragmentResults(), gridSize(), SortInfo::limit, SortInfo::offset, SortInfo::order_entries, anonymous_namespace{Execute.cpp}::permute_storage_columnar(), anonymous_namespace{Execute.cpp}::permute_storage_row_wise(), run_benchmark_import::result, and RelAlgExecutionUnit::sort_info.

Referenced by collectAllDeviceResults().

2800  {
2801  auto& result_per_device = shared_context.getFragmentResults();
2802  const auto first_result_set = result_per_device.front().first;
2803  CHECK(first_result_set);
2804  auto top_query_mem_desc = first_result_set->getQueryMemDesc();
2805  CHECK(!top_query_mem_desc.hasInterleavedBinsOnGpu());
2806  const auto top_n =
2807  ra_exe_unit.sort_info.limit.value_or(0) + ra_exe_unit.sort_info.offset;
2808  top_query_mem_desc.setEntryCount(0);
2809  for (auto& result : result_per_device) {
2810  const auto result_set = result.first;
2811  CHECK(result_set);
2812  result_set->sort(ra_exe_unit.sort_info.order_entries, top_n, device_type, this);
2813  size_t new_entry_cnt = top_query_mem_desc.getEntryCount() + result_set->rowCount();
2814  top_query_mem_desc.setEntryCount(new_entry_cnt);
2815  }
2816  auto top_result_set = std::make_shared<ResultSet>(first_result_set->getTargetInfos(),
2817  first_result_set->getDeviceType(),
2818  top_query_mem_desc,
2819  first_result_set->getRowSetMemOwner(),
2820  blockSize(),
2821  gridSize());
2822  auto top_storage = top_result_set->allocateStorage();
2823  size_t top_output_row_idx{0};
2824  for (auto& result : result_per_device) {
2825  const auto result_set = result.first;
2826  CHECK(result_set);
2827  const auto& top_permutation = result_set->getPermutationBuffer();
2828  CHECK_LE(top_permutation.size(), top_n);
2829  if (top_query_mem_desc.didOutputColumnar()) {
2830  top_output_row_idx = permute_storage_columnar(result_set->getStorage(),
2831  result_set->getQueryMemDesc(),
2832  top_storage,
2833  top_output_row_idx,
2834  top_query_mem_desc,
2835  top_permutation);
2836  } else {
2837  top_output_row_idx = permute_storage_row_wise(result_set->getStorage(),
2838  top_storage,
2839  top_output_row_idx,
2840  top_query_mem_desc,
2841  top_permutation);
2842  }
2843  }
2844  CHECK_EQ(top_output_row_idx, top_query_mem_desc.getEntryCount());
2845  return top_result_set;
2846 }
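
Sharded top-N collection above sorts each device's result down to the first limit + offset entries and then permutes them into a single output buffer whose entry count is the sum of the per-device row counts. A toy version using vectors of ints in place of ResultSet storage (an ascending sort stands in for the real order entries):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Toy sharded top-N collection: sort each device's rows, keep the first
// (limit + offset) entries, and append them into one buffer that a final
// top-N pass can consume.
std::vector<int> collect_sharded_top(std::vector<std::vector<int>> per_device,
                                     std::size_t limit, std::size_t offset) {
  const std::size_t top_n = limit + offset;
  std::vector<int> merged;
  for (auto& rows : per_device) {
    std::sort(rows.begin(), rows.end());  // stands in for ORDER BY
    if (rows.size() > top_n) {
      rows.resize(top_n);
    }
    merged.insert(merged.end(), rows.begin(), rows.end());
  }
  return merged;  // entry count == sum of the per-device top-N row counts
}

int main() {
  const auto merged = collect_sharded_top({{5, 1, 9}, {4, 2, 8}}, /*limit=*/2, /*offset=*/0);
  std::cout << merged.size() << " rows collected\n";  // 4
}
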
#define CHECK_EQ(x, y)
Definition: Logger.h:301
size_t permute_storage_row_wise(const ResultSetStorage *input_storage, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:2776
std::optional< size_t > limit
std::list< Analyzer::OrderEntry > order_entries
#define CHECK_LE(x, y)
Definition: Logger.h:304
unsigned gridSize() const
Definition: Execute.cpp:4318
size_t permute_storage_columnar(const ResultSetStorage *input_storage, const QueryMemoryDescriptor &input_query_mem_desc, const ResultSetStorage *output_storage, size_t output_row_index, const QueryMemoryDescriptor &output_query_mem_desc, const std::vector< uint32_t > &top_permutation)
Definition: Execute.cpp:2726
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define CHECK(condition)
Definition: Logger.h:291
unsigned blockSize() const
Definition: Execute.cpp:4332



bool Executor::compileBody ( const RelAlgExecutionUnit ra_exe_unit,
GroupByAndAggregate group_by_and_aggregate,
QueryMemoryDescriptor query_mem_desc,
const CompilationOptions co,
const GpuSharedMemoryContext gpu_smem_context = {} 
)
private

Definition at line 3348 of file NativeCodegen.cpp.

3352  {
3354 
3355  // Switch the code generation into a separate filter function if enabled.
3356  // Note that accesses to function arguments are still codegenned from the
3357  // row function's arguments, then later automatically forwarded and
3358  // remapped into filter function arguments by redeclareFilterFunction().
3359  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
3360  llvm::Value* loop_done{nullptr};
3361  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
3362  if (cgen_state_->filter_func_) {
3363  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3364  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
3365  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
3366  row_func_entry_bb->begin());
3367  loop_done = cgen_state_->ir_builder_.CreateAlloca(
3368  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
3369  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3370  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
3371  }
3372  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
3373  cgen_state_->current_func_ = cgen_state_->filter_func_;
3374  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
3375  }
3376 
3377  // generate the code for the filter
3378  std::vector<Analyzer::Expr*> primary_quals;
3379  std::vector<Analyzer::Expr*> deferred_quals;
3380  bool short_circuited = CodeGenerator::prioritizeQuals(
3381  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
3382  if (short_circuited) {
3383  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
3384  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
3385  << " quals";
3386  }
3387  llvm::Value* filter_lv = cgen_state_->llBool(true);
3388  CodeGenerator code_generator(this);
3389  for (auto expr : primary_quals) {
3390  // Generate the filter for primary quals
3391  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
3392  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
3393  }
3394  CHECK(filter_lv->getType()->isIntegerTy(1));
3395  llvm::BasicBlock* sc_false{nullptr};
3396  if (!deferred_quals.empty()) {
3397  auto sc_true = llvm::BasicBlock::Create(
3398  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
3399  sc_false = llvm::BasicBlock::Create(
3400  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
3401  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3402  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3403  if (ra_exe_unit.join_quals.empty()) {
3404  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3405  }
3406  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3407  filter_lv = cgen_state_->llBool(true);
3408  }
3409  for (auto expr : deferred_quals) {
3410  filter_lv = cgen_state_->ir_builder_.CreateAnd(
3411  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
3412  }
3413 
3414  CHECK(filter_lv->getType()->isIntegerTy(1));
3415  auto ret = group_by_and_aggregate.codegen(
3416  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3417 
3418  // Switch the code generation back to the row function if a filter
3419  // function was enabled.
3420  if (cgen_state_->filter_func_) {
3421  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3422  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
3423  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3424  }
3425 
3426  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3427  cgen_state_->current_func_ = cgen_state_->row_func_;
3428  cgen_state_->filter_func_call_ =
3429  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3430 
3431  // Create real filter function declaration after placeholder call
3432  // is emitted.
3434 
3435  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3436  auto loop_done_true = llvm::BasicBlock::Create(
3437  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
3438  auto loop_done_false = llvm::BasicBlock::Create(
3439  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
3440  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
3441  loop_done->getType()->getPointerElementType(), loop_done);
3442  cgen_state_->ir_builder_.CreateCondBr(
3443  loop_done_flag, loop_done_true, loop_done_false);
3444  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3445  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3446  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3447  } else {
3448  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3449  }
3450  }
3451  return ret;
3452 }
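
compileBody above splits the quals into cheap "primary" predicates, which are AND-ed together and can short-circuit into the sc_false block, and "deferred" predicates that are evaluated only when the primary ones pass. A plain C++ analogue of that control flow (illustrative, not the generated IR):

#include <functional>
#include <iostream>
#include <vector>

// Cheap ("primary") predicates run first and short-circuit the expensive
// ("deferred") ones, which is what the sc_true / sc_false blocks encode.
bool row_passes(const std::vector<std::function<bool()>>& primary,
                const std::vector<std::function<bool()>>& deferred) {
  for (const auto& qual : primary) {
    if (!qual()) {
      return false;  // sc_false: reject the row without touching deferred quals
    }
  }
  for (const auto& qual : deferred) {
    if (!qual()) {
      return false;
    }
  }
  return true;
}

int main() {
  const int x = 3;
  std::cout << row_passes({[&] { return x > 0; }}, {[&] { return x % 2 == 1; }}) << '\n';  // 1
}
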
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define AUTOMATIC_IR_METADATA(CGENSTATE)
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr * > &primary_quals, std::vector< Analyzer::Expr * > &deferred_quals, const PlanState::HoistedFiltersSet &hoisted_quals)
Definition: LogicalIR.cpp:157
#define CHECK(condition)
Definition: Logger.h:291
void redeclareFilterFunction()
Definition: IRCodegen.cpp:1086
#define VLOG(n)
Definition: Logger.h:388
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > Executor::compileWorkUnit ( const std::vector< InputTableInfo > &  query_infos,
const PlanState::DeletedColumnsMap deleted_cols_map,
const RelAlgExecutionUnit ra_exe_unit,
const CompilationOptions co,
const ExecutionOptions eo,
const CudaMgr_Namespace::CudaMgr cuda_mgr,
const bool  allow_lazy_fetch,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
const bool  has_cardinality_estimation,
ColumnCacheMap column_cache,
RenderInfo render_info = nullptr 
)
private

Definition at line 2817 of file NativeCodegen.cpp.

2829  {
2830  auto timer = DEBUG_TIMER(__func__);
2831 
2833  if (!cuda_mgr) {
2834  throw QueryMustRunOnCpu();
2835  }
2836  }
2837 
2838 #ifndef NDEBUG
2839  static std::uint64_t counter = 0;
2840  ++counter;
2841  VLOG(1) << "CODEGEN #" << counter << ":";
2842  LOG(IR) << "CODEGEN #" << counter << ":";
2843  LOG(PTX) << "CODEGEN #" << counter << ":";
2844  LOG(ASM) << "CODEGEN #" << counter << ":";
2845 #endif
2846 
2847  // cgenstate_manager uses the RAII pattern to manage the lifetime of
2848  // CgenState instances.
2849  Executor::CgenStateManager cgenstate_manager(*this,
2850  allow_lazy_fetch,
2851  query_infos,
2852  deleted_cols_map,
2853  &ra_exe_unit); // locks compilation_mutex
2854  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2855 
2856  GroupByAndAggregate group_by_and_aggregate(
2857  this,
2858  co.device_type,
2859  ra_exe_unit,
2860  query_infos,
2861  row_set_mem_owner,
2862  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2863  : std::nullopt);
2864  auto query_mem_desc =
2865  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2866  max_groups_buffer_entry_guess,
2867  crt_min_byte_width,
2868  render_info,
2870 
2871  if (query_mem_desc->getQueryDescriptionType() ==
2873  !has_cardinality_estimation && (!render_info || !render_info->isInSitu()) &&
2874  !eo.just_explain) {
2875  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2876  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2877  }
2878 
2879  const bool output_columnar = query_mem_desc->didOutputColumnar();
2880  const bool gpu_shared_mem_optimization =
2882  ra_exe_unit,
2883  cuda_mgr,
2884  co.device_type,
2885  cuda_mgr ? this->blockSize() : 1,
2886  cuda_mgr ? this->numBlocksPerMP() : 1);
2887  if (gpu_shared_mem_optimization) {
2888  // disable interleaved bins optimization on the GPU
2889  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2890  LOG(DEBUG1) << "GPU shared memory is used for the " +
2891  query_mem_desc->queryDescTypeToString() + " query(" +
2892  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2893  query_mem_desc.get())) +
2894  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2895  }
2896 
2897  const GpuSharedMemoryContext gpu_smem_context(
2898  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2899 
2901  const size_t num_count_distinct_descs =
2902  query_mem_desc->getCountDistinctDescriptorsSize();
2903  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2904  const auto& count_distinct_descriptor =
2905  query_mem_desc->getCountDistinctDescriptor(i);
2906  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::UnorderedSet ||
2907  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2908  !co.hoist_literals)) {
2909  throw QueryMustRunOnCpu();
2910  }
2911  }
2912 
2913  // we currently do not support varlen projection based on baseline groupby when
2914  // 1) the target table is multi-fragmented and 2) multiple gpus are involved in query
2915  // processing; in this case, we punt the query to cpu to avoid a server crash
2916  for (const auto expr : ra_exe_unit.target_exprs) {
2917  if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2918  bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
2919  if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus &&
2920  !g_leaf_count) {
2921  std::set<const Analyzer::ColumnVar*,
2922  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
2924  gby_expr->collect_column_var(colvar_set, true);
2925  for (const auto cv : colvar_set) {
2926  if (cv->get_type_info().is_varlen()) {
2927  const auto tbl_key = cv->getTableKey();
2928  std::for_each(query_infos.begin(),
2929  query_infos.end(),
2930  [&tbl_key](const InputTableInfo& input_table_info) {
2931  if (input_table_info.table_key == tbl_key &&
2932  input_table_info.info.fragments.size() > 1) {
2933  throw QueryMustRunOnCpu();
2934  }
2935  });
2936  }
2937  }
2938  }
2939  }
2940  }
2941  }
2942 
2943  // Read the module template and target either CPU or GPU
2944  // by binding the stream position functions to the right implementation:
2945  // stride access for GPU, contiguous for CPU
2946  CHECK(cgen_state_->module_ == nullptr);
2947  cgen_state_->set_module_shallow_copy(get_rt_module(), /*always_clone=*/true);
2948 
2949  auto is_gpu = co.device_type == ExecutorDeviceType::GPU;
2950  if (is_gpu) {
2951  cgen_state_->module_->setDataLayout(get_gpu_data_layout());
2952  cgen_state_->module_->setTargetTriple(get_gpu_target_triple_string());
2953  }
2954  if (has_udf_module(/*is_gpu=*/is_gpu)) {
2956  get_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2957  }
2958  if (has_rt_udf_module(/*is_gpu=*/is_gpu)) {
2960  get_rt_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2961  }
2962 
2964 
2965  auto agg_fnames =
2966  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2967 
2968  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2969 
2970  const bool is_group_by{query_mem_desc->isGroupBy()};
2971  auto [query_func, row_func_call] = is_group_by
2973  co.hoist_literals,
2974  *query_mem_desc,
2975  co.device_type,
2976  ra_exe_unit.scan_limit,
2977  gpu_smem_context)
2978  : query_template(cgen_state_->module_,
2979  agg_slot_count,
2980  co.hoist_literals,
2981  !!ra_exe_unit.estimator,
2982  gpu_smem_context);
2983  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2984  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2985  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2986 
2987  cgen_state_->query_func_ = query_func;
2988  cgen_state_->row_func_call_ = row_func_call;
2989  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2990  &query_func->getEntryBlock().front());
2991 
2992  // Generate the function signature and column head fetches s.t.
2993  // double indirection isn't needed in the inner loop
2994  auto& fetch_bb = query_func->front();
2995  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2996  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2997  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2998  get_arg_by_name(query_func, "byte_stream"),
2999  fetch_ir_builder,
3000  cgen_state_->context_);
3001  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
3002 
3003  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
3004  is_group_by ? 0 : agg_slot_count,
3005  co.hoist_literals,
3006  cgen_state_->module_,
3007  cgen_state_->context_);
3008  CHECK(cgen_state_->row_func_);
3009  cgen_state_->row_func_bb_ =
3010  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
3011 
3013  auto filter_func_ft =
3014  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
3015  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
3016  llvm::Function::ExternalLinkage,
3017  "filter_func",
3018  cgen_state_->module_);
3019  CHECK(cgen_state_->filter_func_);
3020  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
3021  cgen_state_->context_, "entry", cgen_state_->filter_func_);
3022  }
3023 
3024  cgen_state_->current_func_ = cgen_state_->row_func_;
3025  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3026 
3027  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
3028  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
3029  const auto join_loops =
3030  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
3031 
3032  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
3033  for (auto& simple_qual : ra_exe_unit.simple_quals) {
3034  plan_state_->addSimpleQual(simple_qual);
3035  }
3036  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
3037  if (is_not_deleted_bb) {
3038  cgen_state_->row_func_bb_ = is_not_deleted_bb;
3039  }
3040  if (!join_loops.empty()) {
3041  codegenJoinLoops(join_loops,
3042  body_execution_unit,
3043  group_by_and_aggregate,
3044  query_func,
3045  cgen_state_->row_func_bb_,
3046  *(query_mem_desc.get()),
3047  co,
3048  eo);
3049  } else {
3050  const bool can_return_error = compileBody(
3051  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
3052  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
3054  createErrorCheckControlFlow(query_func,
3057  join_loops,
3058  co.device_type,
3059  group_by_and_aggregate.query_infos_);
3060  }
3061  }
3062  std::vector<llvm::Value*> hoisted_literals;
3063 
3064  if (co.hoist_literals) {
3065  VLOG(1) << "number of hoisted literals: "
3066  << cgen_state_->query_func_literal_loads_.size()
3067  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
3068  << " bytes";
3069  }
3070 
3071  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
3072  // we have some hoisted literals...
3073  hoisted_literals = inlineHoistedLiterals();
3074  }
3075 
3076  // replace the row func placeholder call with the call to the actual row func
3077  std::vector<llvm::Value*> row_func_args;
3078  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
3079  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
3080  }
3081  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
3082  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
3083  row_func_args.push_back(get_arg_by_name(query_func, "row_func_mgr"));
3084  // push hoisted literals arguments, if any
3085  row_func_args.insert(
3086  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
3087  llvm::ReplaceInstWithInst(
3088  cgen_state_->row_func_call_,
3089  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
3090 
3091  // replace the filter func placeholder call with the call to the actual filter func
3092  if (cgen_state_->filter_func_) {
3093  std::vector<llvm::Value*> filter_func_args;
3094  for (auto arg_it = cgen_state_->filter_func_args_.begin();
3095  arg_it != cgen_state_->filter_func_args_.end();
3096  ++arg_it) {
3097  filter_func_args.push_back(arg_it->first);
3098  }
3099  llvm::ReplaceInstWithInst(
3100  cgen_state_->filter_func_call_,
3101  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
3102  }
3103 
3104  // Aggregate
3105  plan_state_->init_agg_vals_ =
3106  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
3107 
3108  /*
3109  * If we have decided to use GPU shared memory (the decision is not made here), then
3110  * we generate proper code for the extra components it needs (buffer initialization and
3111  * gpu reduction from shared memory to global memory). We then inject these functions
3112  * into the already compiled query_func (replacing two placeholders, write_back_nop and
3113  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
3114  */
3115  if (gpu_smem_context.isSharedMemoryUsed()) {
3116  if (query_mem_desc->getQueryDescriptionType() ==
3118  GpuSharedMemCodeBuilder gpu_smem_code(
3119  cgen_state_->module_,
3120  cgen_state_->context_,
3121  *query_mem_desc,
3123  plan_state_->init_agg_vals_,
3124  executor_id_);
3125  gpu_smem_code.codegen();
3126  gpu_smem_code.injectFunctionsInto(query_func);
3127 
3128  // helper functions are used for caching purposes later
3129  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
3130  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
3131  LOG(IR) << gpu_smem_code.toString();
3132  }
3133  }
3134 
3135  auto multifrag_query_func = cgen_state_->module_->getFunction(
3136  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
3137  CHECK(multifrag_query_func);
3138 
3140  insertErrorCodeChecker(multifrag_query_func,
3141  get_index_by_name(query_func, "error_code"),
3142  co.hoist_literals,
3144  }
3145 
3146  bind_query(query_func,
3147  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
3148  multifrag_query_func,
3149  cgen_state_->module_);
3150 
3151  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
3152  if (cgen_state_->filter_func_) {
3153  root_funcs.push_back(cgen_state_->filter_func_);
3154  }
3155  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
3156  *cgen_state_->module_, root_funcs, {multifrag_query_func});
3157 
3158  // Always inline the row function and the filter function.
3159  // We don't want register spills in the inner loops.
3160  // LLVM seems to correctly free up alloca instructions
3161  // in these functions even when they are inlined.
3163  if (cgen_state_->filter_func_) {
3165  }
3166 
3167 #ifndef NDEBUG
3168  // Add helpful metadata to the LLVM IR for debugging.
3170 #endif
3171 
3172  auto const device_str = co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n";
3173  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
3174  std::string llvm_ir =
3175  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
3176  serialize_llvm_object(cgen_state_->row_func_) +
3177  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_) : "");
3178  VLOG(3) << "Unoptimized IR for the " << device_str << "\n" << llvm_ir << "\nEnd of IR";
3180 #ifdef WITH_JIT_DEBUG
3181  throw std::runtime_error(
3182  "Explain optimized not available when JIT runtime debug symbols are enabled");
3183 #else
3184  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
3185  // optimized IR after NVVM reflect
3186  llvm::legacy::PassManager pass_manager;
3187  optimize_ir(query_func,
3188  cgen_state_->module_,
3189  pass_manager,
3190  live_funcs,
3191  gpu_smem_context.isSharedMemoryUsed(),
3192  co);
3193 #endif // WITH_JIT_DEBUG
3194  llvm_ir =
3195  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
3196  serialize_llvm_object(cgen_state_->row_func_) +
3197  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
3198  : "");
3199 #ifndef NDEBUG
3200  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
3201 #endif
3202  }
3203  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
3204  LOG(IR) << "IR for the " << device_str;
3205 #ifdef NDEBUG
3206  LOG(IR) << serialize_llvm_object(query_func)
3207  << serialize_llvm_object(cgen_state_->row_func_)
3208  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
3209  : "")
3210  << "\nEnd of IR";
3211 #else
3212  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
3213 #endif
3214  // Insert calls to "register_buffer_with_executor_rsm" for allocations
3215  // in runtime functions (i.e. from RBC) that do not already have them
3217 
3218  // Run some basic validation checks on the LLVM IR before code is generated below.
3219  verify_function_ir(cgen_state_->row_func_);
3220  if (cgen_state_->filter_func_) {
3221  verify_function_ir(cgen_state_->filter_func_);
3222  }
3223 
3224  // Generate final native code from the LLVM IR.
3225  return std::make_tuple(
3228  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
3229  : optimizeAndCodegenGPU(query_func,
3230  multifrag_query_func,
3231  live_funcs,
3232  is_group_by || ra_exe_unit.estimator,
3233  cuda_mgr,
3234  gpu_smem_context.isSharedMemoryUsed(),
3235  co),
3236  cgen_state_->getLiterals(),
3237  output_columnar,
3238  llvm_ir,
3239  std::move(gpu_smem_context)},
3240  std::move(query_mem_desc));
3241 }
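
One detail worth calling out from the listing above is literal hoisting (co.hoist_literals): literal constants are not baked into the IR but are loaded from a buffer, and the hoisted values travel as extra row-function arguments. A minimal sketch of the underlying idea, using an illustrative buffer type that is not part of the Executor API; the usual benefit is that queries differing only in literal values can reuse one compiled kernel.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Illustrative literal buffer (not the Executor API): values are packed into
// a side buffer and the generated code reads them through an offset.
struct LiteralBuffer {
  std::vector<int8_t> bytes;
  std::size_t append_int64(int64_t v) {
    const std::size_t off = bytes.size();
    bytes.resize(off + sizeof(v));
    std::memcpy(bytes.data() + off, &v, sizeof(v));
    return off;  // the compiled kernel loads the literal from this offset
  }
};

int main() {
  LiteralBuffer buf;
  const std::size_t off = buf.append_int64(42);  // e.g. a hoisted "x > 42" literal
  int64_t v = 0;
  std::memcpy(&v, buf.bytes.data() + off, sizeof(v));
  std::cout << "hoisted literal at offset " << off << " = " << v << '\n';
}
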
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, const std::vector< JoinLoop > &join_loops, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
std::vector< Analyzer::Expr * > target_exprs
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void codegenJoinLoops(const std::vector< JoinLoop > &join_loops, const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, llvm::Function *query_func, llvm::BasicBlock *entry_bb, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const ExecutionOptions &eo)
Definition: IRCodegen.cpp:1203
const std::unique_ptr< llvm::Module > & get_udf_module(bool is_gpu=false) const
Definition: Execute.h:535
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned cuda_blocksize, const unsigned num_blocks_per_mp)
static bool colvar_comp(const ColumnVar *l, const ColumnVar *r)
Definition: Analyzer.h:215
void collect_column_var(std::set< const ColumnVar *, bool(*)(const ColumnVar *, const ColumnVar *)> &colvar_set, bool include_agg) const override
Definition: Analyzer.h:222
void optimize_ir(llvm::Function *query_func, llvm::Module *llvm_module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function * > &live_funcs, const bool is_gpu_smem_used, const CompilationOptions &co)
#define LOG(tag)
Definition: Logger.h:285
void AutoTrackBuffersInRuntimeIR()
void mark_function_always_inline(llvm::Function *func)
llvm::StringRef get_gpu_data_layout()
const std::unique_ptr< llvm::Module > & get_rt_udf_module(bool is_gpu=false) const
Definition: Execute.h:539
std::vector< InputDescriptor > input_descs
std::string serialize_llvm_metadata_footnotes(llvm::Function *query_func, CgenState *cgen_state)
std::tuple< llvm::Function *, llvm::CallInst * > query_template(llvm::Module *mod, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
void insertErrorCodeChecker(llvm::Function *query_func, unsigned const error_code_idx, bool hoist_literals, bool allow_runtime_query_interrupt)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
unsigned numBlocksPerMP() const
Definition: Execute.cpp:4327
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::tuple< llvm::Function *, llvm::CallInst * > query_group_by_template(llvm::Module *mod, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
void addTransientStringLiterals(const RelAlgExecutionUnit &ra_exe_unit, const std::shared_ptr< RowSetMemoryOwner > &row_set_mem_owner)
Definition: Execute.cpp:2494
std::vector< std::string > get_agg_fnames(const std::vector< Analyzer::Expr * > &target_exprs, const bool is_group_by)
std::string to_string(char const *&&v)
void preloadFragOffsets(const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos)
Definition: Execute.cpp:4254
const ExecutorId executor_id_
Definition: Execute.h:1476
llvm::StringRef get_gpu_target_triple_string()
void verify_function_ir(const llvm::Function *func)
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:168
static std::unordered_set< llvm::Function * > markDeadRuntimeFuncs(llvm::Module &module, const std::vector< llvm::Function * > &roots, const std::vector< llvm::Function * > &leaves)
ExecutorExplainType explain_type
unsigned get_index_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:187
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU(llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
const std::shared_ptr< Analyzer::Estimator > estimator
#define AUTOMATIC_IR_METADATA(CGENSTATE)
const std::unique_ptr< llvm::Module > & get_rt_module() const
Definition: Execute.h:532
this
Definition: Execute.cpp:281
#define AUTOMATIC_IR_METADATA_DONE()
llvm::Function * create_row_function(const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *llvm_module, llvm::LLVMContext &context)
ExecutorDeviceType device_type
void bind_pos_placeholders(const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *llvm_module)
llvm::BasicBlock * codegenSkipDeletedOuterTableRow(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
void bind_query(llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *llvm_module)
std::string serialize_llvm_object(const T *llvm_obj)
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU(llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool is_gpu_smem_used, const CompilationOptions &)
std::vector< llvm::Value * > generate_column_heads_load(const int num_columns, llvm::Value *byte_stream_arg, llvm::IRBuilder<> &ir_builder, llvm::LLVMContext &ctx)
bool has_udf_module(bool is_gpu=false) const
Definition: Execute.h:555
bool g_enable_filter_function
Definition: Execute.cpp:87
std::list< std::shared_ptr< Analyzer::Expr > > quals
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
bool has_rt_udf_module(bool is_gpu=false) const
Definition: Execute.h:559
std::vector< llvm::Value * > inlineHoistedLiterals()
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
std::vector< JoinLoop > buildJoinLoops(RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const std::vector< InputTableInfo > &query_infos, ColumnCacheMap &column_cache)
Definition: IRCodegen.cpp:610
unsigned blockSize() const
Definition: Execute.cpp:4332
size_t g_leaf_count
Definition: ParserNode.cpp:78
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
#define VLOG(n)
Definition: Logger.h:388
size_t get_shared_memory_size(const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals
size_t g_gpu_smem_threshold
Definition: Execute.cpp:138
AggregatedColRange Executor::computeColRangesCache ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 4860 of file Execute.cpp.

References CHECK, Catalog_Namespace::get_metadata_for_column(), getLeafColumnRange(), getTableInfo(), AggregatedColRange::setColRange(), and ExpressionRange::typeSupportsRange().

Referenced by setupCaching().

4861  {
4862  AggregatedColRange agg_col_range_cache;
4863  std::unordered_set<shared::TableKey> phys_table_keys;
4864  for (const auto& phys_input : phys_inputs) {
4865  phys_table_keys.emplace(phys_input.db_id, phys_input.table_id);
4866  }
4867  std::vector<InputTableInfo> query_infos;
4868  for (const auto& table_key : phys_table_keys) {
4869  query_infos.emplace_back(InputTableInfo{table_key, getTableInfo(table_key)});
4870  }
4871  for (const auto& phys_input : phys_inputs) {
4872  auto db_id = phys_input.db_id;
4873  auto table_id = phys_input.table_id;
4874  auto column_id = phys_input.col_id;
4875  const auto cd =
4876  Catalog_Namespace::get_metadata_for_column({db_id, table_id, column_id});
4877  CHECK(cd);
4878  if (ExpressionRange::typeSupportsRange(cd->columnType)) {
4879  const auto col_var = std::make_unique<Analyzer::ColumnVar>(
4880  cd->columnType, shared::ColumnKey{db_id, table_id, column_id}, 0);
4881  const auto col_range = getLeafColumnRange(col_var.get(), query_infos, this, false);
4882  agg_col_range_cache.setColRange(phys_input, col_range);
4883  }
4884  }
4885  return agg_col_range_cache;
4886 }
const ColumnDescriptor * get_metadata_for_column(const ::shared::ColumnKey &column_key)
Fragmenter_Namespace::TableInfo getTableInfo(const shared::TableKey &table_key) const
Definition: Execute.cpp:711
ExpressionRange getLeafColumnRange(const Analyzer::ColumnVar *col_expr, const std::vector< InputTableInfo > &query_infos, const Executor *executor, const bool is_outer_join_proj)
#define CHECK(condition)
Definition: Logger.h:291
void setColRange(const PhysicalInput &, const ExpressionRange &)
static bool typeSupportsRange(const SQLTypeInfo &ti)
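The cache built above is keyed by PhysicalInput (db id, table id, column id). As a minimal sketch of how such a cache can be consulted later, the helper below uses only the asMap() accessor (also shown under dumpCache() further below); the function itself is hypothetical and not part of the Executor API.

#include <optional>

// Hypothetical helper: look up the cached range for one physical column,
// returning nothing if computeColRangesCache() skipped it (e.g., because its
// type does not support expression ranges).
std::optional<ExpressionRange> lookupCachedRange(const AggregatedColRange& cache,
                                                 const PhysicalInput& phys_input) {
  const auto& range_map = cache.asMap();
  const auto it = range_map.find(phys_input);
  if (it == range_map.end()) {
    return std::nullopt;
  }
  return it->second;
}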


StringDictionaryGenerations Executor::computeStringDictionaryGenerations ( const std::unordered_set< PhysicalInput > &  phys_inputs)
private

Definition at line 4888 of file Execute.cpp.

References CHECK, Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), kENCODING_DICT, anonymous_namespace{Execute.cpp}::prepare_string_dictionaries(), and StringDictionaryGenerations::setGeneration().

Referenced by setupCaching().

4889  {
4890  StringDictionaryGenerations string_dictionary_generations;
 4891  // Foreign tables may not have populated dictionaries for encoded columns. If this is
 4892  // the case, then we need to populate them here to make sure that the generations are set
 4893  // correctly.
4894  prepare_string_dictionaries(phys_inputs);
4895  for (const auto& phys_input : phys_inputs) {
 4896  const auto catalog =
 4897  Catalog_Namespace::SysCatalog::instance().getCatalog(phys_input.db_id);
 4898  CHECK(catalog);
4899  const auto cd = catalog->getMetadataForColumn(phys_input.table_id, phys_input.col_id);
4900  CHECK(cd);
4901  const auto& col_ti =
4902  cd->columnType.is_array() ? cd->columnType.get_elem_type() : cd->columnType;
4903  if (col_ti.is_string() && col_ti.get_compression() == kENCODING_DICT) {
4904  const auto& dict_key = col_ti.getStringDictKey();
4905  const auto dd = catalog->getMetadataForDict(dict_key.dict_id);
4906  CHECK(dd && dd->stringDict);
4907  string_dictionary_generations.setGeneration(dict_key,
4908  dd->stringDict->storageEntryCount());
4909  }
4910  }
4911  return string_dictionary_generations;
4912 }
void setGeneration(const shared::StringDictKey &dict_key, const uint64_t generation)
void prepare_string_dictionaries(const std::unordered_set< PhysicalInput > &phys_inputs)
Definition: Execute.cpp:217
static SysCatalog & instance()
Definition: SysCatalog.h:343
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:291
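The generation recorded for each dictionary is simply its storageEntryCount() at setup time, so later lookups can be restricted to entries that existed when the query started. The snippet below is an illustrative sketch of that idea only, assuming dense string ids assigned in insertion order; the helper is hypothetical and not part of the proxy API.

// Hypothetical sketch: an id at or beyond the recorded generation was added
// after the query's snapshot was taken, so it is treated as not yet visible.
bool isVisibleInGeneration(const int32_t string_id, const uint64_t generation) {
  return string_id >= 0 && static_cast<uint64_t>(string_id) < generation;
}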


TableGenerations Executor::computeTableGenerations ( const std::unordered_set< shared::TableKey > &  phys_table_keys)
private

Definition at line 4914 of file Execute.cpp.

References getTableInfo(), and TableGenerations::setGeneration().

Referenced by setupCaching().

4915  {
4916  TableGenerations table_generations;
4917  for (const auto& table_key : phys_table_keys) {
4918  const auto table_info = getTableInfo(table_key);
4919  table_generations.setGeneration(
4920  table_key,
4921  TableGeneration{static_cast<int64_t>(table_info.getPhysicalNumTuples()), 0});
4922  }
4923  return table_generations;
4924 }
void setGeneration(const shared::TableKey &table_key, const TableGeneration &generation)
Fragmenter_Namespace::TableInfo getTableInfo(const shared::TableKey &table_key) const
Definition: Execute.cpp:711


bool Executor::containsLeftDeepOuterJoin ( ) const
inline

Definition at line 614 of file Execute.h.

References cgen_state_.

614  {
615  return cgen_state_->contains_left_deep_outer_join_;
616  }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
void Executor::createErrorCheckControlFlow ( llvm::Function *  query_func,
bool  run_with_dynamic_watchdog,
bool  run_with_allowing_runtime_interrupt,
const std::vector< JoinLoop > &  join_loops,
ExecutorDeviceType  device_type,
const std::vector< InputTableInfo > &  input_table_infos 
)
private

Definition at line 2033 of file NativeCodegen.cpp.

2039  {
2040  AUTOMATIC_IR_METADATA(cgen_state_.get());
2041 
2042  // check whether the row processing was successful; currently, it can
2043  // fail by running out of group by buffer slots
2044 
2045  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
2046  // when both dynamic watchdog and runtime interrupt are turned on,
2047  // we use dynamic watchdog
2048  run_with_allowing_runtime_interrupt = false;
2049  }
2050 
2051  {
2052  // disable injecting query interrupt checker if the session info is invalid
2053  heavyai::shared_lock<heavyai::shared_mutex> session_read_lock(
2054  executor_session_mutex_);
2055  if (current_query_session_.empty()) {
2056  run_with_allowing_runtime_interrupt = false;
2057  }
2058  }
2059 
2060  llvm::Value* row_count = nullptr;
2061  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
2062  device_type == ExecutorDeviceType::GPU) {
2063  row_count =
2064  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
2065  }
2066 
2067  bool done_splitting = false;
2068  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
2069  ++bb_it) {
2070  llvm::Value* pos = nullptr;
2071  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2072  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
2073  llvm::isa<llvm::PHINode>(*inst_it)) {
2074  if (inst_it->getName() == "pos") {
2075  pos = &*inst_it;
2076  }
2077  continue;
2078  }
2079  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2080  continue;
2081  }
2082  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2083  auto const row_func_name = CodegenUtil::getCalledFunctionName(row_func_call);
2084  if (row_func_name && *row_func_name == "row_process") {
2085  auto next_inst_it = inst_it;
2086  ++next_inst_it;
2087  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2088  auto& br_instr = bb_it->back();
2089  llvm::IRBuilder<> ir_builder(&br_instr);
2090  llvm::Value* err_lv = &*inst_it;
2091  llvm::Value* err_lv_returned_from_row_func = nullptr;
2092  if (run_with_dynamic_watchdog) {
2093  CHECK(pos);
2094  llvm::Value* call_watchdog_lv = nullptr;
2095  if (device_type == ExecutorDeviceType::GPU) {
2096  // In order to make sure all threads within a block see the same barrier,
2097  // only those blocks in which none of the threads have experienced the critical
2098  // edge will go through the dynamic watchdog computation
2099  CHECK(row_count);
2100  auto crit_edge_rem =
2101  (blockSize() & (blockSize() - 1))
2102  ? ir_builder.CreateSRem(
2103  row_count,
2104  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
2105  : ir_builder.CreateAnd(
2106  row_count,
2107  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
2108  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
2109  crit_edge_threshold->setName("crit_edge_threshold");
2110 
2111  // only those threads where pos < crit_edge_threshold go through dynamic
2112  // watchdog call
2113  call_watchdog_lv =
2114  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
2115  } else {
2116  // CPU path: run watchdog for every 64th row
2117  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
2118  call_watchdog_lv = ir_builder.CreateICmp(
2119  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
2120  }
2121  CHECK(call_watchdog_lv);
2122  auto error_check_bb = bb_it->splitBasicBlock(
2123  llvm::BasicBlock::iterator(br_instr), ".error_check");
2124  auto& watchdog_br_instr = bb_it->back();
2125 
2126  auto watchdog_check_bb = llvm::BasicBlock::Create(
2127  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
2128  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
2129  auto detected_timeout = watchdog_ir_builder.CreateCall(
2130  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
2131  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
2132  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
2133  watchdog_ir_builder.CreateBr(error_check_bb);
2134 
2135  llvm::ReplaceInstWithInst(
2136  &watchdog_br_instr,
2137  llvm::BranchInst::Create(
2138  watchdog_check_bb, error_check_bb, call_watchdog_lv));
2139  ir_builder.SetInsertPoint(&br_instr);
2140  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2141 
2142  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
2143  unified_err_lv->addIncoming(err_lv, &*bb_it);
2144  err_lv = unified_err_lv;
2145  } else if (run_with_allowing_runtime_interrupt) {
2146  CHECK(pos);
2147  llvm::Value* call_check_interrupt_lv{nullptr};
2148  llvm::Value* interrupt_err_lv{nullptr};
2149  llvm::BasicBlock* error_check_bb{nullptr};
2150  llvm::BasicBlock* interrupt_check_bb{nullptr};
2151  llvm::Instruction* check_interrupt_br_instr{nullptr};
2152 
2153  auto has_loop_join = std::any_of(
2154  join_loops.begin(), join_loops.end(), [](const JoinLoop& join_loop) {
2155  return join_loop.isNestedLoopJoin();
2156  });
2157  auto codegen_interrupt_checker = [&]() {
2158  error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
2159  ".error_check");
2160  check_interrupt_br_instr = &bb_it->back();
2161 
2162  interrupt_check_bb = llvm::BasicBlock::Create(
2163  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2164  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2165  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2166  cgen_state_->module_->getFunction("check_interrupt"), {});
2167  interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
2168  detected_interrupt,
2169  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
2170  err_lv);
2171  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2172  };
2173  if (has_loop_join) {
2174  codegen_interrupt_checker();
2175  CHECK(interrupt_check_bb);
2176  CHECK(check_interrupt_br_instr);
2177  llvm::ReplaceInstWithInst(check_interrupt_br_instr,
2178  llvm::BranchInst::Create(interrupt_check_bb));
2179  ir_builder.SetInsertPoint(&br_instr);
2180  err_lv = interrupt_err_lv;
2181  } else {
2182  if (device_type == ExecutorDeviceType::GPU) {
2183  // approximate how many times the %pos variable
2184  // is increased --> the number of iterations
2185  // here we calculate the # bit shift by considering grid/block/fragment
2186  // sizes, since if we use a fixed one (i.e., per 64th increment) some CUDA
2187  // threads cannot enter the interrupt checking block depending on the
2188  // fragment size --> a thread may not take care of 64 threads if an outer
2189  // table is not sufficiently large, and so cannot be interrupted
2190  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
2191  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
2192  int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
2193  uint64_t interrupt_checking_freq = 32;
2194  auto freq_control_knob = g_running_query_interrupt_freq;
2195  CHECK_GT(freq_control_knob, 0);
2196  CHECK_LE(freq_control_knob, 1.0);
2197  if (!input_table_infos.empty()) {
2198  const auto& outer_table_info = *input_table_infos.begin();
2199  auto num_outer_table_tuples =
2200  outer_table_info.info.getFragmentNumTuplesUpperBound();
2201  if (num_outer_table_tuples > 0) {
2202  // gridSize * blockSize --> pos_step (idx of the next row per thread)
2203  // we additionally multiply pos_step by two since the number of
2204  // dispatched blocks is double the gridSize
2205  // # tuples (of fragment) / pos_step --> maximum # increments (K)
2206  // we also multiply K by 1 / freq_control_knob to control the frequency;
2207  // so, to check the interrupt status more frequently, make K
2208  // smaller
2209  auto max_inc = uint64_t(
2210  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
2211  if (max_inc < 2) {
2212  // `max_inc` is too small, so this correction is necessary to keep
2213  // `interrupt_checking_freq` valid (i.e., larger than zero)
2214  max_inc = 2;
2215  }
2216  auto calibrated_inc =
2217  uint64_t(floor(max_inc * (1 - freq_control_knob)));
2218  interrupt_checking_freq =
2219  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
2220  // cover the case where interrupt_checking_freq > K;
2221  // if so, some threads still cannot branch to the interrupt checker,
2222  // so we manually use a freq that is smaller than, but close to, max_inc
2223  if (interrupt_checking_freq > max_inc) {
2224  interrupt_checking_freq = max_inc / 2;
2225  }
2226  if (interrupt_checking_freq < 8) {
2227  // such a small freq would check the interrupt status too frequently,
2228  // so we clamp it to a reasonable minimum freq value
2229  interrupt_checking_freq = 8;
2230  }
2231  }
2232  }
2233  VLOG(1) << "Set the running query interrupt checking frequency: "
2234  << interrupt_checking_freq;
2235  // check the interrupt flag for every interrupt_checking_freq-th iteration
2236  llvm::Value* pos_shifted_per_iteration =
2237  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
2238  auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
2239  interrupt_checking_freq);
2240  call_check_interrupt_lv =
2241  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2242  interrupt_predicate,
2243  cgen_state_->llInt(int64_t(0LL)));
2244  } else {
2245  // CPU path: run interrupt checker for every 64th row
2246  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
2247  call_check_interrupt_lv =
2248  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2249  interrupt_predicate,
2250  cgen_state_->llInt(int64_t(0LL)));
2251  }
2252  codegen_interrupt_checker();
2253  CHECK(call_check_interrupt_lv);
2254  CHECK(interrupt_err_lv);
2255  CHECK(interrupt_check_bb);
2256  CHECK(error_check_bb);
2257  CHECK(check_interrupt_br_instr);
2258  llvm::ReplaceInstWithInst(
2259  check_interrupt_br_instr,
2260  llvm::BranchInst::Create(
2261  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
2262  ir_builder.SetInsertPoint(&br_instr);
2263  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2264 
2265  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
2266  unified_err_lv->addIncoming(err_lv, &*bb_it);
2267  err_lv = unified_err_lv;
2268  }
2269  }
2270  if (!err_lv_returned_from_row_func) {
2271  err_lv_returned_from_row_func = err_lv;
2272  }
2273  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
2274  // let kernel execution finish as expected, regardless of the observed error,
2275  // unless it is from the dynamic watchdog where all threads within that block
2276  // return together.
2277  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2278  err_lv,
2279  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
2280  } else {
2281  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
2282  err_lv,
2283  cgen_state_->llInt(static_cast<int32_t>(0)));
2284  }
2285  auto error_bb = llvm::BasicBlock::Create(
2286  cgen_state_->context_, ".error_exit", query_func, new_bb);
2287  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
2288  llvm::CallInst::Create(
2289  cgen_state_->module_->getFunction("record_error_code"),
2290  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
2291  "",
2292  error_bb);
2293  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2294  llvm::ReplaceInstWithInst(&br_instr,
2295  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2296  done_splitting = true;
2297  break;
2298  }
2299  }
2300  }
2301  CHECK(done_splitting);
2302 }
std::optional< std::string_view > getCalledFunctionName(llvm::CallInst &call_inst)
double g_running_query_interrupt_freq
Definition: Execute.cpp:137
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
QuerySessionId current_query_session_
Definition: Execute.h:1576
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:81
#define CHECK_GT(x, y)
Definition: Logger.h:305
std::shared_lock< T > shared_lock
unsigned getExpOfTwo(unsigned n)
Definition: MathUtils.cpp:23
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:168
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1622
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LE(x, y)
Definition: Logger.h:304
unsigned gridSize() const
Definition: Execute.cpp:4318
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
#define CHECK(condition)
Definition: Logger.h:291
bool any_of(std::vector< Analyzer::Expr * > const &target_exprs)
unsigned blockSize() const
Definition: Execute.cpp:4332
#define VLOG(n)
Definition: Logger.h:388
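The comments in the GPU branch above derive the interrupt-checking frequency from the grid/block sizes, the outer fragment size, and g_running_query_interrupt_freq. The standalone sketch below reproduces that arithmetic with illustrative numbers (grid size 152, block size 1024, a 50M-row fragment, and knob 0.1 are assumptions, not values from the source), and assumes getExpOfTwo() behaves like floor(log2):

#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  const unsigned grid_size = 152, block_size = 1024;   // gridSize(), blockSize()
  const uint64_t num_outer_table_tuples = 50'000'000;  // fragment tuple upper bound
  const double freq_control_knob = 0.1;                // g_running_query_interrupt_freq

  // pos advances by gridSize * blockSize * 2 per iteration, so max_inc (K) is
  // the maximum number of increments a thread sees for this fragment
  auto max_inc = static_cast<uint64_t>(
      std::floor(num_outer_table_tuples / (grid_size * block_size * 2.0)));
  if (max_inc < 2) {
    max_inc = 2;
  }
  const auto calibrated_inc =
      static_cast<uint64_t>(std::floor(max_inc * (1 - freq_control_knob)));
  // round down to a power of two, mirroring pow(2, getExpOfTwo(calibrated_inc))
  auto interrupt_checking_freq = static_cast<uint64_t>(
      std::pow(2, std::floor(std::log2(static_cast<double>(calibrated_inc)))));
  if (interrupt_checking_freq > max_inc) {
    interrupt_checking_freq = max_inc / 2;
  }
  if (interrupt_checking_freq < 8) {
    interrupt_checking_freq = 8;
  }
  // with these inputs: max_inc = 160, calibrated_inc = 144, freq = 128
  std::cout << "interrupt_checking_freq = " << interrupt_checking_freq << '\n';
}
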
std::vector< std::unique_ptr< ExecutionKernel > > Executor::createKernels ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit,
ColumnFetcher column_fetcher,
const std::vector< InputTableInfo > &  table_infos,
const ExecutionOptions eo,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const size_t  context_count,
const QueryCompilationDescriptor query_comp_desc,
const QueryMemoryDescriptor query_mem_desc,
RenderInfo render_info,
std::unordered_set< int > &  available_gpus,
int &  available_cpus 
)
private

Determines execution dispatch mode and required fragments for a given query step, then creates kernels to execute the query and returns them for launch.

Definition at line 2874 of file Execute.cpp.

References ExecutionOptions::allow_multifrag, CHECK, CHECK_GE, CHECK_GT, anonymous_namespace{Execute.cpp}::checkWorkUnitWatchdog(), data_mgr_, deviceCount(), g_inner_join_fragment_skipping, getColLazyFetchInfo(), QueryCompilationDescriptor::getDeviceType(), QueryMemoryDescriptor::getEntryCount(), SharedKernelContext::getFragOffsets(), Data_Namespace::DataMgr::getMemoryInfo(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, ExecutionOptions::gpu_input_mem_limit_percent, Data_Namespace::GPU_LEVEL, anonymous_namespace{Execute.cpp}::has_lazy_fetched_columns(), logger::INFO, RelAlgExecutionUnit::input_descs, KernelPerFragment, LOG, MultifragmentKernel, ExecutionOptions::outer_fragment_indices, plan_state_, Projection, query_mem_desc, RelAlgExecutionUnit::target_exprs, QueryMemoryDescriptor::toString(), RelAlgExecutionUnit::use_bump_allocator, VLOG, and ExecutionOptions::with_watchdog.

Referenced by executeWorkUnitImpl().

2887  {
2888  std::vector<std::unique_ptr<ExecutionKernel>> execution_kernels;
2889 
2890  QueryFragmentDescriptor fragment_descriptor(
2891  ra_exe_unit,
2892  table_infos,
2893  query_comp_desc.getDeviceType() == ExecutorDeviceType::GPU
2894  ? data_mgr_->getMemoryInfo(Data_Namespace::GPU_LEVEL)
2895  : std::vector<Data_Namespace::MemoryInfo>{},
2896  eo.gpu_input_mem_limit_percent,
2897  eo.outer_fragment_indices);
2898  CHECK(!ra_exe_unit.input_descs.empty());
2899 
2900  const auto device_type = query_comp_desc.getDeviceType();
2901  const bool uses_lazy_fetch =
2902  plan_state_->allow_lazy_fetch_ &&
2903  has_lazy_fetched_columns(getColLazyFetchInfo(ra_exe_unit.target_exprs));
2904  const bool use_multifrag_kernel = (device_type == ExecutorDeviceType::GPU) &&
2905  eo.allow_multifrag && (!uses_lazy_fetch || is_agg);
2906  const auto device_count = deviceCount(device_type);
2907  CHECK_GT(device_count, 0);
2908 
2909  fragment_descriptor.buildFragmentKernelMap(ra_exe_unit,
2910  shared_context.getFragOffsets(),
2911  device_count,
2912  device_type,
2913  use_multifrag_kernel,
2914  g_inner_join_fragment_skipping,
2915  this);
2916  if (eo.with_watchdog && fragment_descriptor.shouldCheckWorkUnitWatchdog()) {
2917  checkWorkUnitWatchdog(ra_exe_unit, table_infos, device_type, device_count);
2918  }
2919 
2920  if (use_multifrag_kernel) {
2921  VLOG(1) << "Creating multifrag execution kernels";
2922  VLOG(1) << query_mem_desc.toString();
2923 
2924  // NB: We should never be on this path when the query is retried because of running
2925  // out of group by slots; also, for scan only queries on CPU we want the
2926  // high-granularity, fragment by fragment execution instead. For scan only queries on
2927  // GPU, we want the multifrag kernel path to save the overhead of allocating an output
2928  // buffer per fragment.
2929  auto multifrag_kernel_dispatch = [&ra_exe_unit,
2930  &execution_kernels,
2931  &column_fetcher,
2932  &eo,
2933  &query_comp_desc,
2934  &query_mem_desc,
2935  render_info](const int device_id,
2936  const FragmentsList& frag_list,
2937  const int64_t rowid_lookup_key) {
2938  execution_kernels.emplace_back(
2939  std::make_unique<ExecutionKernel>(ra_exe_unit,
2940  ExecutorDeviceType::GPU,
2941  device_id,
2942  eo,
2943  column_fetcher,
2944  query_comp_desc,
2945  query_mem_desc,
2946  frag_list,
2947  ExecutorDispatchMode::MultifragmentKernel,
2948  render_info,
2949  rowid_lookup_key));
2950  };
2951  fragment_descriptor.assignFragsToMultiDispatch(multifrag_kernel_dispatch);
2952  } else {
2953  VLOG(1) << "Creating one execution kernel per fragment";
2954  VLOG(1) << query_mem_desc.toString();
2955 
2956  if (!ra_exe_unit.use_bump_allocator && allow_single_frag_table_opt &&
2957  (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) &&
2958  table_infos.size() == 1 && table_infos.front().table_key.table_id > 0) {
2959  const auto max_frag_size =
2960  table_infos.front().info.getFragmentNumTuplesUpperBound();
2961  if (max_frag_size < query_mem_desc.getEntryCount()) {
2962  LOG(INFO) << "Lowering scan limit from " << query_mem_desc.getEntryCount()
2963  << " to match max fragment size " << max_frag_size
2964  << " for kernel per fragment execution path.";
2965  throw CompilationRetryNewScanLimit(max_frag_size);
2966  }
2967  }
2968 
2969  size_t frag_list_idx{0};
2970  auto fragment_per_kernel_dispatch = [&ra_exe_unit,
2971  &execution_kernels,
2972  &column_fetcher,
2973  &eo,
2974  &frag_list_idx,
2975  &device_type,
2976  &query_comp_desc,
2977  &query_mem_desc,
2978  render_info](const int device_id,
2979  const FragmentsList& frag_list,
2980  const int64_t rowid_lookup_key) {
2981  if (!frag_list.size()) {
2982  return;
2983  }
2984  CHECK_GE(device_id, 0);
2985 
2986  execution_kernels.emplace_back(
2987  std::make_unique<ExecutionKernel>(ra_exe_unit,
2988  device_type,
2989  device_id,
2990  eo,
2991  column_fetcher,
2992  query_comp_desc,
2993  query_mem_desc,
2994  frag_list,
2995  ExecutorDispatchMode::KernelPerFragment,
2996  render_info,
2997  rowid_lookup_key));
2998  ++frag_list_idx;
2999  };
3000 
3001  fragment_descriptor.assignFragsToKernelDispatch(fragment_per_kernel_dispatch,
3002  ra_exe_unit);
3003  }
3004 
3005  return execution_kernels;
3006 }
bool is_agg(const Analyzer::Expr *expr)
std::vector< Analyzer::Expr * > target_exprs
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
ExecutorDeviceType getDeviceType() const
void checkWorkUnitWatchdog(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &table_infos, const ExecutorDeviceType device_type, const int device_count)
Definition: Execute.cpp:1822
const std::vector< uint64_t > & getFragOffsets()
std::string toString() const
#define LOG(tag)
Definition: Logger.h:285
std::vector< size_t > outer_fragment_indices
std::vector< ColumnLazyFetchInfo > getColLazyFetchInfo(const std::vector< Analyzer::Expr * > &target_exprs) const
Definition: Execute.cpp:992
std::vector< InputDescriptor > input_descs
#define CHECK_GE(x, y)
Definition: Logger.h:306
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:1297
#define CHECK_GT(x, y)
Definition: Logger.h:305
std::vector< FragmentsPerTable > FragmentsList
bool g_inner_join_fragment_skipping
Definition: Execute.cpp:94
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
std::vector< MemoryInfo > getMemoryInfo(const MemoryLevel memLevel) const
Definition: DataMgr.cpp:380
#define CHECK(condition)
Definition: Logger.h:291
double gpu_input_mem_limit_percent
bool has_lazy_fetched_columns(const std::vector< ColumnLazyFetchInfo > &fetched_cols)
Definition: Execute.cpp:2863
#define VLOG(n)
Definition: Logger.h:388
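The key decision above is whether to build one multi-fragment kernel per device or one kernel per fragment. Below is a minimal sketch of that predicate, mirroring the use_multifrag_kernel computation in the listing; the free function itself is illustrative, not part of the Executor API.

// Multi-fragment GPU kernels are used only when the ExecutionOptions allow it
// and the kernel either fetches no columns lazily or performs an aggregation.
bool useMultifragKernel(const ExecutorDeviceType device_type,
                        const bool allow_multifrag,  // eo.allow_multifrag
                        const bool uses_lazy_fetch,
                        const bool is_agg) {
  return device_type == ExecutorDeviceType::GPU && allow_multifrag &&
         (!uses_lazy_fetch || is_agg);
}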


CudaMgr_Namespace::CudaMgr* Executor::cudaMgr ( ) const
inlineprivate

Definition at line 865 of file Execute.h.

References CHECK, data_mgr_, and Data_Namespace::DataMgr::getCudaMgr().

Referenced by deviceCount(), deviceCycles(), isArchPascalOrLater(), numBlocksPerMP(), and warpSize().

865  {
866  CHECK(data_mgr_);
867  auto cuda_mgr = data_mgr_->getCudaMgr();
868  CHECK(cuda_mgr);
869  return cuda_mgr;
870  }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:235
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
#define CHECK(condition)
Definition: Logger.h:291


int Executor::deviceCount ( const ExecutorDeviceType  device_type) const

Definition at line 1297 of file Execute.cpp.

References cudaMgr(), CudaMgr_Namespace::CudaMgr::getDeviceCount(), and GPU.

Referenced by createKernels(), and deviceCountForMemoryLevel().

1297  {
1298  if (device_type == ExecutorDeviceType::GPU) {
1299  return cudaMgr()->getDeviceCount();
1300  } else {
1301  return 1;
1302  }
1303 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:865
int getDeviceCount() const
Definition: CudaMgr.h:90


int Executor::deviceCountForMemoryLevel ( const Data_Namespace::MemoryLevel  memory_level) const
private

Definition at line 1305 of file Execute.cpp.

References CPU, deviceCount(), GPU, and Data_Namespace::GPU_LEVEL.

Referenced by buildHashTableForQualifier().

1306  {
1307  return memory_level == GPU_LEVEL ? deviceCount(ExecutorDeviceType::GPU)
1308  : deviceCount(ExecutorDeviceType::CPU);
1309 }
int deviceCount(const ExecutorDeviceType) const
Definition: Execute.cpp:1297
ExecutorDeviceType


int64_t Executor::deviceCycles ( int  milliseconds) const
private

Definition at line 4362 of file Execute.cpp.

References cudaMgr(), and CudaMgr_Namespace::CudaMgr::getAllDeviceProperties().

4362  {
4363  const auto& dev_props = cudaMgr()->getAllDeviceProperties();
4364  return static_cast<int64_t>(dev_props.front().clockKhz) * milliseconds;
4365 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:865
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:134
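Since clockKhz counts thousands of cycles per second, it is also the number of cycles per millisecond, so the product above directly yields a cycle budget. A quick illustrative calculation (the clock rate is an assumed example, not a value from the source):

// A 1.41 GHz device reports clockKhz == 1'410'000, so a 100 ms budget is
// 1'410'000 * 100 = 141'000'000'000 cycles.
const int64_t cycles = static_cast<int64_t>(1'410'000) * 100;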


std::string Executor::dumpCache ( ) const

Definition at line 5486 of file Execute.cpp.

References agg_col_range_cache_, TableGenerations::asMap(), AggregatedColRange::asMap(), row_set_mem_owner_, and table_generations_.

5486  {
5487  std::stringstream ss;
5488  ss << "colRangeCache: ";
5489  for (auto& [phys_input, exp_range] : agg_col_range_cache_.asMap()) {
5490  ss << "{" << phys_input.col_id << ", " << phys_input.table_id
5491  << "} = " << exp_range.toString() << ", ";
5492  }
5493  ss << "stringDictGenerations: ";
5494  for (auto& [key, val] : row_set_mem_owner_->getStringDictionaryGenerations().asMap()) {
5495  ss << key << " = " << val << ", ";
5496  }
5497  ss << "tableGenerations: ";
5498  for (auto& [key, val] : table_generations_.asMap()) {
5499  ss << key << " = {" << val.tuple_count << ", " << val.start_rowid << "}, ";
5500  }
5501  ss << "\n";
5502  return ss.str();
5503 }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1572
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: Execute.h:1533
TableGenerations table_generations_
Definition: Execute.h:1573
const std::unordered_map< PhysicalInput, ExpressionRange > & asMap() const
const std::unordered_map< shared::TableKey, TableGeneration > & asMap() const


void Executor::enableRuntimeQueryInterrupt ( const double  runtime_query_check_freq,
const unsigned  pending_query_check_freq 
) const

Definition at line 5240 of file Execute.cpp.

References g_enable_runtime_query_interrupt, g_pending_query_interrupt_freq, and g_running_query_interrupt_freq.

5242  {
5243  // The only scenario in which we intentionally call this function is
5244  // to allow runtime query interrupt in QueryRunner for test cases.
5245  // Because a test machine's default settings do not allow runtime query interrupt,
5246  // we have to turn it on within test code if necessary.
5247  g_enable_runtime_query_interrupt = true;
5248  g_pending_query_interrupt_freq = pending_query_check_freq;
5249  g_running_query_interrupt_freq = runtime_query_check_freq;
5252  }
5253 }
double g_running_query_interrupt_freq
Definition: Execute.cpp:137
unsigned g_pending_query_interrupt_freq
Definition: Execute.cpp:136
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:133
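A minimal usage sketch for a test fixture, assuming an Executor instance is already available (e.g., via QueryRunner); the wrapper function and the parameter values are illustrative only.

// Enable runtime interrupts with a 10% running-check knob and a pending-query
// check every 1000 iterations; both numbers are arbitrary test values.
void enableInterruptsForTest(const Executor& executor) {
  executor.enableRuntimeQueryInterrupt(/*runtime_query_check_freq=*/0.1,
                                       /*pending_query_check_freq=*/1000);
}
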
void Executor::enrollQuerySession ( const QuerySessionId query_session,
const std::string &  query_str,
const std::string &  submitted_time_str,
const size_t  executor_id,
const QuerySessionStatus::QueryStatus  query_session_status 
)

Definition at line 5057 of file Execute.cpp.

References addToQuerySessionList(), current_query_session_, and executor_session_mutex_.

5062  {
5063  // enroll the query session into the Executor's session map
5064  heavyai::unique_lock<heavyai::shared_mutex> session_write_lock(executor_session_mutex_);
5065  if (query_session.empty()) {
5066  return;
5067  }
5068 
5069  addToQuerySessionList(query_session,
5070  query_str,
5071  submitted_time_str,
5072  executor_id,
5073  query_session_status,
5074  session_write_lock);
5075 
5076  if (query_session_status == QuerySessionStatus::QueryStatus::RUNNING_QUERY_KERNEL) {
5077  current_query_session_ = query_session;
5078  }
5079 }
QuerySessionId current_query_session_
Definition: Execute.h:1576
std::unique_lock< T > unique_lock
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
bool addToQuerySessionList(const QuerySessionId &query_session, const std::string &query_str, const std::string &submitted, const size_t executor_id, const QuerySessionStatus::QueryStatus query_status, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
Definition: Execute.cpp:5086


ResultSetPtr Executor::executeExplain ( const QueryCompilationDescriptor query_comp_desc)
private

Definition at line 2490 of file Execute.cpp.

References QueryCompilationDescriptor::getIR().

Referenced by executeWorkUnitImpl().

2490  {
2491  return std::make_shared<ResultSet>(query_comp_desc.getIR());
2492 }


int32_t Executor::executePlanWithGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< size_t >  outer_tab_frag_ids,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const shared::TableKey outer_table_key,
const int64_t  limit,
const uint32_t  start_rowid,
const uint32_t  num_tables,
const bool  allow_runtime_interrupt,
RenderInfo render_info,
const bool  optimize_cuda_block_and_grid_sizes,
const int64_t  rows_to_process = -1 
)
private

Definition at line 4028 of file Execute.cpp.

References anonymous_namespace{Utm.h}::a, blockSize(), CHECK, CHECK_NE, anonymous_namespace{Execute.cpp}::check_rows_less_than_needed(), checkIsQuerySessionInterrupted(), CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_GPU_MEM, ERR_OUT_OF_RENDER_MEM, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY, ERR_WIDTH_BUCKET_INVALID_ARGUMENT, report::error_code(), executor_session_mutex_, logger::FATAL, g_enable_dynamic_watchdog, CompilationResult::generated_code, getCurrentQuerySession(), QueryMemoryDescriptor::getEntryCount(), getJoinHashTablePtrs(), QueryExecutionContext::getRowSet(), GpuSharedMemoryContext::getSharedMemorySize(), CompilationResult::gpu_smem_context, gridSize(), RelAlgExecutionUnit::groupby_exprs, INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, interrupted_, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, shared::printContainer(), QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, RenderInfo::render_allocator_map_ptr, RelAlgExecutionUnit::scan_limit, serializeLiterals(), QueryMemoryDescriptor::setEntryCount(), RelAlgExecutionUnit::union_all, RenderInfo::useCudaBuffers(), and VLOG.

4048  {
4049  auto timer = DEBUG_TIMER(__func__);
4050  INJECT_TIMER(executePlanWithGroupBy);
4051  // TODO: get results via a separate method, but need to do something with literals.
4052  CHECK(!results || !(*results));
4053  if (col_buffers.empty()) {
4054  return 0;
4055  }
4056  CHECK_NE(ra_exe_unit.groupby_exprs.size(), size_t(0));
4057  // TODO(alex):
4058  // 1. Optimize size (make keys more compact).
4059  // 2. Resize on overflow.
4060  // 3. Optimize runtime.
4061  auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
4062  int32_t error_code = 0;
4063  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
4064  if (allow_runtime_interrupt) {
4065  bool isInterrupted = false;
4066  {
4067  heavyai::shared_lock<heavyai::shared_mutex> session_read_lock(
4068  executor_session_mutex_);
4069  const auto query_session = getCurrentQuerySession(session_read_lock);
4070  isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock);
4071  }
4072  if (isInterrupted) {
4074  }
4075  }
4076  if (g_enable_dynamic_watchdog && interrupted_.load()) {
4077  return ERR_INTERRUPTED;
4078  }
4079 
4080  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
4081  if (render_info && render_info->useCudaBuffers()) {
4082  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
4083  }
4084 
4085  VLOG(2) << "bool(ra_exe_unit.union_all)=" << bool(ra_exe_unit.union_all)
4086  << " ra_exe_unit.input_descs="
4087  << shared::printContainer(ra_exe_unit.input_descs)
4088  << " ra_exe_unit.input_col_descs="
4089  << shared::printContainer(ra_exe_unit.input_col_descs)
4090  << " ra_exe_unit.scan_limit=" << ra_exe_unit.scan_limit
4091  << " num_rows=" << shared::printContainer(num_rows)
4092  << " frag_offsets=" << shared::printContainer(frag_offsets)
4093  << " query_exe_context->query_buffers_->num_rows_="
4094  << query_exe_context->query_buffers_->num_rows_
4095  << " query_exe_context->query_mem_desc_.getEntryCount()="
4096  << query_exe_context->query_mem_desc_.getEntryCount()
4097  << " device_id=" << device_id << " outer_table_key=" << outer_table_key
4098  << " scan_limit=" << scan_limit << " start_rowid=" << start_rowid
4099  << " num_tables=" << num_tables;
4100 
4101  RelAlgExecutionUnit ra_exe_unit_copy = ra_exe_unit;
4102  // For UNION ALL, filter out input_descs and input_col_descs that are not associated
4103  // with outer_table_id.
4104  if (ra_exe_unit_copy.union_all) {
4105  // Sort outer_table_id first, then pop the rest off of ra_exe_unit_copy.input_descs.
4106  std::stable_sort(ra_exe_unit_copy.input_descs.begin(),
4107  ra_exe_unit_copy.input_descs.end(),
4108  [outer_table_key](auto const& a, auto const& b) {
4109  return a.getTableKey() == outer_table_key &&
4110  b.getTableKey() != outer_table_key;
4111  });
4112  while (!ra_exe_unit_copy.input_descs.empty() &&
4113  ra_exe_unit_copy.input_descs.back().getTableKey() != outer_table_key) {
4114  ra_exe_unit_copy.input_descs.pop_back();
4115  }
4116  // Filter ra_exe_unit_copy.input_col_descs.
4117  ra_exe_unit_copy.input_col_descs.remove_if(
4118  [outer_table_key](auto const& input_col_desc) {
4119  return input_col_desc->getScanDesc().getTableKey() != outer_table_key;
4120  });
4121  query_exe_context->query_mem_desc_.setEntryCount(ra_exe_unit_copy.scan_limit);
4122  }
4123 
4124  if (device_type == ExecutorDeviceType::CPU) {
4125  const int32_t scan_limit_for_query =
4126  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
4127  const int32_t max_matched = scan_limit_for_query == 0
4128  ? query_exe_context->query_mem_desc_.getEntryCount()
4129  : scan_limit_for_query;
4130  CpuCompilationContext* cpu_generated_code =
4131  dynamic_cast<CpuCompilationContext*>(compilation_result.generated_code.get());
4132  CHECK(cpu_generated_code);
4133  query_exe_context->launchCpuCode(ra_exe_unit_copy,
4134  cpu_generated_code,
4135  hoist_literals,
4136  hoist_buf,
4137  col_buffers,
4138  num_rows,
4139  frag_offsets,
4140  max_matched,
4141  &error_code,
4142  start_rowid,
4143  num_tables,
4144  join_hash_table_ptrs,
4145  rows_to_process);
4146  } else {
4147  try {
4148  GpuCompilationContext* gpu_generated_code =
4149  dynamic_cast<GpuCompilationContext*>(compilation_result.generated_code.get());
4150  CHECK(gpu_generated_code);
4151  query_exe_context->launchGpuCode(
4152  ra_exe_unit_copy,
4153  gpu_generated_code,
4154  hoist_literals,
4155  hoist_buf,
4156  col_buffers,
4157  num_rows,
4158  frag_offsets,
4159  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit,
4160  data_mgr,
4161  blockSize(),
4162  gridSize(),
4163  device_id,
4164  compilation_result.gpu_smem_context.getSharedMemorySize(),
4165  &error_code,
4166  num_tables,
4167  allow_runtime_interrupt,
4168  join_hash_table_ptrs,
4169  render_allocator_map_ptr,
4170  optimize_cuda_block_and_grid_sizes);
4171  } catch (const OutOfMemory&) {
4172  return ERR_OUT_OF_GPU_MEM;
4173  } catch (const OutOfRenderMemory&) {
4174  return ERR_OUT_OF_RENDER_MEM;
4175  } catch (const StreamingTopNNotSupportedInRenderQuery&) {
4177  } catch (const std::exception& e) {
4178  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
4179  }
4180  }
4181 
4182  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
4183  error_code == Executor::ERR_DIV_BY_ZERO ||
4184  error_code == Executor::ERR_OUT_OF_TIME ||
4185  error_code == Executor::ERR_INTERRUPTED ||
4187  error_code == Executor::ERR_GEOS ||
4189  return error_code;
4190  }
4191 
4192  if (results && error_code != Executor::ERR_OVERFLOW_OR_UNDERFLOW &&
4193  error_code != Executor::ERR_DIV_BY_ZERO && !render_allocator_map_ptr) {
4194  *results = query_exe_context->getRowSet(ra_exe_unit_copy,
4195  query_exe_context->query_mem_desc_);
4196  CHECK(*results);
4197  VLOG(2) << "results->rowCount()=" << (*results)->rowCount();
4198  (*results)->holdLiterals(hoist_buf);
4199  }
4200  if (error_code < 0 && render_allocator_map_ptr) {
4201  auto const adjusted_scan_limit =
4202  ra_exe_unit_copy.union_all ? ra_exe_unit_copy.scan_limit : scan_limit;
4203  // More rows passed the filter than available slots. We don't have a count to check,
4204  // so assume we met the limit if a scan limit is set
4205  if (adjusted_scan_limit != 0) {
4206  return 0;
4207  } else {
4208  return error_code;
4209  }
4210  }
4211  if (results && error_code &&
4212  (!scan_limit || check_rows_less_than_needed(*results, scan_limit))) {
4213  return error_code; // unlucky, not enough results and we ran out of slots
4214  }
4215 
4216  return 0;
4217 }
bool checkIsQuerySessionInterrupted(const std::string &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:5220
bool useCudaBuffers() const
Definition: RenderInfo.cpp:54
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
std::vector< int8_t * > getJoinHashTablePtrs(const ExecutorDeviceType device_type, const int device_id)
Definition: Execute.cpp:4219
void setEntryCount(const size_t val)
std::atomic< bool > interrupted_
Definition: Execute.h:1543
GpuSharedMemoryContext gpu_smem_context
const std::optional< bool > union_all
#define LOG(tag)
Definition: Logger.h:285
size_t getSharedMemorySize() const
std::vector< int64_t * > launchCpuCode(const RelAlgExecutionUnit &ra_exe_unit, const CpuCompilationContext *fn_ptrs, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, int32_t *error_code, const uint32_t start_rowid, const uint32_t num_tables, const std::vector< int8_t * > &join_hash_tables, const int64_t num_rows_to_process=-1)
std::vector< InputDescriptor > input_descs
static const int32_t ERR_GEOS
Definition: Execute.h:1629
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:81
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
constexpr double a
Definition: Utm.h:32
std::shared_lock< T > shared_lock
std::unique_ptr< QueryMemoryInitializer > query_buffers_
static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY
Definition: Execute.h:1627
static const int32_t ERR_DIV_BY_ZERO
Definition: Execute.h:1615
ResultSetPtr getRowSet(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc) const
#define INJECT_TIMER(DESC)
Definition: measure.h:96
#define CHECK_NE(x, y)
Definition: Logger.h:302
static const int32_t ERR_OUT_OF_RENDER_MEM
Definition: Execute.h:1619
int32_t executePlanWithGroupBy(const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr *results, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< size_t > outer_tab_frag_ids, QueryExecutionContext *, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *, const int device_id, const shared::TableKey &outer_table_key, const int64_t limit, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info, const bool optimize_cuda_block_and_grid_sizes, const int64_t rows_to_process=-1)
Definition: Execute.cpp:4028
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW
Definition: Execute.h:1621
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1622
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
Definition: Execute.h:1628
static const int32_t ERR_OUT_OF_GPU_MEM
Definition: Execute.h:1616
std::shared_ptr< CompilationContext > generated_code
QueryMemoryDescriptor query_mem_desc_
QuerySessionId & getCurrentQuerySession(heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:4952
std::vector< int8_t > serializeLiterals(const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
Definition: Execute.cpp:1035
unsigned gridSize() const
Definition: Execute.cpp:4318
std::unordered_map< int, CgenState::LiteralValues > literal_values
std::unique_ptr< RenderAllocatorMap > render_allocator_map_ptr
Definition: RenderInfo.h:33
static const int32_t ERR_WIDTH_BUCKET_INVALID_ARGUMENT
Definition: Execute.h:1630
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
bool check_rows_less_than_needed(const ResultSetPtr &results, const size_t scan_limit)
Definition: Execute.cpp:4021
def error_code
Definition: report.py:244
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:107
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
unsigned blockSize() const
Definition: Execute.cpp:4332
#define VLOG(n)
Definition: Logger.h:388
std::vector< int64_t * > launchGpuCode(const RelAlgExecutionUnit &ra_exe_unit, const CompilationContext *compilation_context, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, Data_Namespace::DataMgr *data_mgr, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const size_t shared_memory_size, int32_t *error_code, const uint32_t num_tables, const bool allow_runtime_interrupt, const std::vector< int8_t * > &join_hash_tables, RenderAllocatorMap *render_allocator_map, bool optimize_cuda_block_and_grid_sizes)
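For UNION ALL the listing first reorders input_descs so the outer table's descriptors come first and then pops everything else, before filtering input_col_descs the same way. Below is a self-contained sketch of that reorder-and-trim step; the templated free function is illustrative, not part of the Executor.

#include <algorithm>
#include <vector>

template <typename InputDescT, typename TableKeyT>
void keepOnlyOuterTableInputs(std::vector<InputDescT>& input_descs,
                              const TableKeyT& outer_table_key) {
  // move outer-table descriptors to the front, preserving relative order
  std::stable_sort(input_descs.begin(), input_descs.end(),
                   [&](const auto& a, const auto& b) {
                     return a.getTableKey() == outer_table_key &&
                            b.getTableKey() != outer_table_key;
                   });
  // drop trailing descriptors that belong to other tables
  while (!input_descs.empty() &&
         input_descs.back().getTableKey() != outer_table_key) {
    input_descs.pop_back();
  }
}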


int32_t Executor::executePlanWithoutGroupBy ( const RelAlgExecutionUnit ra_exe_unit,
const CompilationResult compilation_result,
const bool  hoist_literals,
ResultSetPtr results,
const std::vector< Analyzer::Expr * > &  target_exprs,
const ExecutorDeviceType  device_type,
std::vector< std::vector< const int8_t * >> &  col_buffers,
QueryExecutionContext query_exe_context,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
Data_Namespace::DataMgr data_mgr,
const int  device_id,
const uint32_t  start_rowid,
const uint32_t  num_tables,
const bool  allow_runtime_interrupt,
RenderInfo render_info,
const bool  optimize_cuda_block_and_grid_sizes,
const int64_t  rows_to_process = -1 
)
private

Definition at line 3802 of file Execute.cpp.

References blockSize(), CHECK, CHECK_EQ, checkIsQuerySessionInterrupted(), CPU, DEBUG_TIMER, ERR_DIV_BY_ZERO, ERR_GEOS, ERR_INTERRUPTED, ERR_OUT_OF_GPU_MEM, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, ERR_WIDTH_BUCKET_INVALID_ARGUMENT, report::error_code(), RelAlgExecutionUnit::estimator, QueryExecutionContext::estimator_result_set_, executor_session_mutex_, logger::FATAL, g_bigint_count, g_enable_dynamic_watchdog, CompilationResult::generated_code, get_target_info(), QueryExecutionContext::getAggInitValForIndex(), getCurrentQuerySession(), getJoinHashTablePtrs(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, CompilationResult::gpu_smem_context, gridSize(), INJECT_TIMER, interrupted_, is_distinct_target(), heavyai::InSituFlagsOwnerInterface::isInSitu(), GpuSharedMemoryContext::isSharedMemoryUsed(), kAVG, kCOUNT, kSAMPLE, QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), CompilationResult::literal_values, LOG, QueryExecutionContext::query_buffers_, QueryExecutionContext::query_mem_desc_, reduceResults(), RenderInfo::render_allocator_map_ptr, serializeLiterals(), takes_float_argument(), and RenderInfo::useCudaBuffers().

3820  {
3821  INJECT_TIMER(executePlanWithoutGroupBy);
3822  auto timer = DEBUG_TIMER(__func__);
3823  CHECK(!results || !(*results));
3824  if (col_buffers.empty()) {
3825  return 0;
3826  }
3827 
3828  RenderAllocatorMap* render_allocator_map_ptr = nullptr;
3829  if (render_info) {
3830  // TODO(adb): make sure that we either never get here in the CPU case, or if we do get
3831  // here, we are in non-insitu mode.
3832  CHECK(render_info->useCudaBuffers() || !render_info->isInSitu())
3833  << "CUDA disabled rendering in the executePlanWithoutGroupBy query path is "
3834  "currently unsupported.";
3835  render_allocator_map_ptr = render_info->render_allocator_map_ptr.get();
3836  }
3837 
3838  int32_t error_code = 0;
3839  std::vector<int64_t*> out_vec;
3840  const auto hoist_buf = serializeLiterals(compilation_result.literal_values, device_id);
3841  const auto join_hash_table_ptrs = getJoinHashTablePtrs(device_type, device_id);
3842  std::unique_ptr<OutVecOwner> output_memory_scope;
3843  if (allow_runtime_interrupt) {
3844  bool isInterrupted = false;
3845  {
3848  const auto query_session = getCurrentQuerySession(session_read_lock);
3849  isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock);
3850  }
3851  if (isInterrupted) {
3853  }
3854  }
3855  if (g_enable_dynamic_watchdog && interrupted_.load()) {
3856  return ERR_INTERRUPTED;
3857  }
3858  if (device_type == ExecutorDeviceType::CPU) {
3859  CpuCompilationContext* cpu_generated_code =
3860  dynamic_cast<CpuCompilationContext*>(compilation_result.generated_code.get());
3861  CHECK(cpu_generated_code);
3862  out_vec = query_exe_context->launchCpuCode(ra_exe_unit,
3863  cpu_generated_code,
3864  hoist_literals,
3865  hoist_buf,
3866  col_buffers,
3867  num_rows,
3868  frag_offsets,
3869  0,
3870  &error_code,
3871  start_rowid,
3872  num_tables,
3873  join_hash_table_ptrs,
3874  rows_to_process);
3875  output_memory_scope.reset(new OutVecOwner(out_vec));
3876  } else {
3877  GpuCompilationContext* gpu_generated_code =
3878  dynamic_cast<GpuCompilationContext*>(compilation_result.generated_code.get());
3879  CHECK(gpu_generated_code);
3880  try {
3881  out_vec = query_exe_context->launchGpuCode(
3882  ra_exe_unit,
3883  gpu_generated_code,
3884  hoist_literals,
3885  hoist_buf,
3886  col_buffers,
3887  num_rows,
3888  frag_offsets,
3889  0,
3890  data_mgr,
3891  blockSize(),
3892  gridSize(),
3893  device_id,
3894  compilation_result.gpu_smem_context.getSharedMemorySize(),
3895  &error_code,
3896  num_tables,
3897  allow_runtime_interrupt,
3898  join_hash_table_ptrs,
3899  render_allocator_map_ptr,
3900  optimize_cuda_block_and_grid_sizes);
3901  output_memory_scope.reset(new OutVecOwner(out_vec));
3902  } catch (const OutOfMemory&) {
3903  return ERR_OUT_OF_GPU_MEM;
3904  } catch (const std::exception& e) {
3905  LOG(FATAL) << "Error launching the GPU kernel: " << e.what();
3906  }
3907  }
3908  if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW ||
3909  error_code == Executor::ERR_DIV_BY_ZERO ||
3910  error_code == Executor::ERR_OUT_OF_TIME ||
3911  error_code == Executor::ERR_INTERRUPTED ||
3913  error_code == Executor::ERR_GEOS ||
3915  return error_code;
3916  }
3917  if (ra_exe_unit.estimator) {
3918  CHECK(!error_code);
3919  if (results) {
3920  *results =
3921  std::shared_ptr<ResultSet>(query_exe_context->estimator_result_set_.release());
3922  }
3923  return 0;
3924  }
3925  // Expect delayed results extraction (used for sub-fragments) for estimator only;
3926  CHECK(results);
3927  std::vector<int64_t> reduced_outs;
3928  const auto num_frags = col_buffers.size();
3929  const size_t entry_count =
3930  device_type == ExecutorDeviceType::GPU
3931  ? (compilation_result.gpu_smem_context.isSharedMemoryUsed()
3932  ? 1
3933  : blockSize() * gridSize() * num_frags)
3934  : num_frags;
3935  if (size_t(1) == entry_count) {
3936  for (auto out : out_vec) {
3937  CHECK(out);
3938  reduced_outs.push_back(*out);
3939  }
3940  } else {
3941  size_t out_vec_idx = 0;
3942 
3943  for (const auto target_expr : target_exprs) {
3944  const auto agg_info = get_target_info(target_expr, g_bigint_count);
3945  CHECK(agg_info.is_agg || dynamic_cast<Analyzer::Constant*>(target_expr))
3946  << target_expr->toString();
3947 
3948  const int num_iterations = agg_info.sql_type.is_geometry()
3949  ? agg_info.sql_type.get_physical_coord_cols()
3950  : 1;
3951 
3952  for (int i = 0; i < num_iterations; i++) {
3953  int64_t val1;
3954  const bool float_argument_input = takes_float_argument(agg_info);
3955  if (is_distinct_target(agg_info) ||
3956  shared::is_any<kAPPROX_QUANTILE, kMODE>(agg_info.agg_kind)) {
3957  bool const check = shared::
3958  is_any<kCOUNT, kAPPROX_COUNT_DISTINCT, kAPPROX_QUANTILE, kMODE, kCOUNT_IF>(
3959  agg_info.agg_kind);
3960  CHECK(check) << agg_info.agg_kind;
3961  val1 = out_vec[out_vec_idx][0];
3962  error_code = 0;
3963  } else {
3964  const auto chosen_bytes = static_cast<size_t>(
3965  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx));
3966  std::tie(val1, error_code) = Executor::reduceResults(
3967  agg_info.agg_kind,
3968  agg_info.sql_type,
3969  query_exe_context->getAggInitValForIndex(out_vec_idx),
3970  float_argument_input ? sizeof(int32_t) : chosen_bytes,
3971  out_vec[out_vec_idx],
3972  entry_count,
3973  false,
3974  float_argument_input);
3975  }
3976  if (error_code) {
3977  break;
3978  }
3979  reduced_outs.push_back(val1);
3980  if (agg_info.agg_kind == kAVG ||
3981  (agg_info.agg_kind == kSAMPLE &&
3982  (agg_info.sql_type.is_varlen() || agg_info.sql_type.is_geometry()))) {
3983  const auto chosen_bytes = static_cast<size_t>(
3984  query_exe_context->query_mem_desc_.getPaddedSlotWidthBytes(out_vec_idx +
3985  1));
3986  int64_t val2;
3987  std::tie(val2, error_code) = Executor::reduceResults(
3988  agg_info.agg_kind == kAVG ? kCOUNT : agg_info.agg_kind,
3989  agg_info.sql_type,
3990  query_exe_context->getAggInitValForIndex(out_vec_idx + 1),
3991  float_argument_input ? sizeof(int32_t) : chosen_bytes,
3992  out_vec[out_vec_idx + 1],
3993  entry_count,
3994  false,
3995  false);
3996  if (error_code) {
3997  break;
3998  }
3999  reduced_outs.push_back(val2);
4000  ++out_vec_idx;
4001  }
4002  ++out_vec_idx;
4003  }
4004  }
4005  }
4006 
4007  if (error_code) {
4008  return error_code;
4009  }
4010 
4011  CHECK_EQ(size_t(1), query_exe_context->query_buffers_->result_sets_.size());
4012  auto rows_ptr = std::shared_ptr<ResultSet>(
4013  query_exe_context->query_buffers_->result_sets_[0].release());
4014  rows_ptr->fillOneEntry(reduced_outs);
4015  *results = std::move(rows_ptr);
4016  return error_code;
4017 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
bool checkIsQuerySessionInterrupted(const std::string &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:5220
bool useCudaBuffers() const
Definition: RenderInfo.cpp:54
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
std::vector< int8_t * > getJoinHashTablePtrs(const ExecutorDeviceType device_type, const int device_id)
Definition: Execute.cpp:4219
std::atomic< bool > interrupted_
Definition: Execute.h:1543
GpuSharedMemoryContext gpu_smem_context
#define LOG(tag)
Definition: Logger.h:285
size_t getSharedMemorySize() const
std::vector< int64_t * > launchCpuCode(const RelAlgExecutionUnit &ra_exe_unit, const CpuCompilationContext *fn_ptrs, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, int32_t *error_code, const uint32_t start_rowid, const uint32_t num_tables, const std::vector< int8_t * > &join_hash_tables, const int64_t num_rows_to_process=-1)
static std::pair< int64_t, int32_t > reduceResults(const SQLAgg agg, const SQLTypeInfo &ti, const int64_t agg_init_val, const int8_t out_byte_width, const int64_t *out_vec, const size_t out_vec_sz, const bool is_group_by, const bool float_argument_input)
Definition: Execute.cpp:1312
static const int32_t ERR_GEOS
Definition: Execute.h:1629
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:81
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:106
int32_t executePlanWithoutGroupBy(const RelAlgExecutionUnit &ra_exe_unit, const CompilationResult &, const bool hoist_literals, ResultSetPtr *results, const std::vector< Analyzer::Expr * > &target_exprs, const ExecutorDeviceType device_type, std::vector< std::vector< const int8_t * >> &col_buffers, QueryExecutionContext *query_exe_context, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, Data_Namespace::DataMgr *data_mgr, const int device_id, const uint32_t start_rowid, const uint32_t num_tables, const bool allow_runtime_interrupt, RenderInfo *render_info, const bool optimize_cuda_block_and_grid_sizes, const int64_t rows_to_process=-1)
Definition: Execute.cpp:3802
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:92
std::shared_lock< T > shared_lock
std::unique_ptr< QueryMemoryInitializer > query_buffers_
static const int32_t ERR_DIV_BY_ZERO
Definition: Execute.h:1615
#define INJECT_TIMER(DESC)
Definition: measure.h:96
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW
Definition: Execute.h:1621
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1622
bool g_bigint_count
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:102
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
int64_t getAggInitValForIndex(const size_t index) const
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
Definition: Execute.h:1628
const std::shared_ptr< Analyzer::Estimator > estimator
static const int32_t ERR_OUT_OF_GPU_MEM
Definition: Execute.h:1616
std::shared_ptr< CompilationContext > generated_code
QueryMemoryDescriptor query_mem_desc_
QuerySessionId & getCurrentQuerySession(heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:4952
std::vector< int8_t > serializeLiterals(const std::unordered_map< int, CgenState::LiteralValues > &literals, const int device_id)
Definition: Execute.cpp:1035
Definition: sqldefs.h:78
unsigned gridSize() const
Definition: Execute.cpp:4318
std::unordered_map< int, CgenState::LiteralValues > literal_values
std::unique_ptr< RenderAllocatorMap > render_allocator_map_ptr
Definition: RenderInfo.h:33
static const int32_t ERR_WIDTH_BUCKET_INVALID_ARGUMENT
Definition: Execute.h:1630
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
unsigned blockSize() const
Definition: Execute.cpp:4332
std::unique_ptr< ResultSet > estimator_result_set_
Definition: sqldefs.h:74
std::vector< int64_t * > launchGpuCode(const RelAlgExecutionUnit &ra_exe_unit, const CompilationContext *compilation_context, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, Data_Namespace::DataMgr *data_mgr, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const size_t shared_memory_size, int32_t *error_code, const uint32_t num_tables, const bool allow_runtime_interrupt, const std::vector< int8_t * > &join_hash_tables, RenderAllocatorMap *render_allocator_map, bool optimize_cuda_block_and_grid_sizes)

+ Here is the call graph for this function:

ResultSetPtr Executor::executeTableFunction ( const TableFunctionExecutionUnit  exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const CompilationOptions co,
const ExecutionOptions eo 
)
private

Compiles and dispatches a table function; that is, a function that takes one or more columns as input and returns a ResultSet that can be consumed by subsequent execution steps.

Definition at line 2416 of file Execute.cpp.

References blockSize(), TableFunctionCompilationContext::compile(), table_functions::TableFunction::containsPreFlightFn(), CPU, CompilationOptions::device_type, TableFunctionExecutionContext::execute(), ResultSet::fixupQueryMemoryDescriptor(), getRowSetMemoryOwner(), GPU, gridSize(), table_functions::TableFunction::hasTableFunctionSpecifiedParameter(), INJECT_TIMER, ExecutionOptions::just_validate, CompilationOptions::makeCpuOnly(), query_mem_desc, TableFunctionExecutionUnit::table_func, TableFunction, TableFunctionExecutionUnit::target_exprs, and target_exprs_to_infos().
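The listing below compiles the table function twice when the function declares a pre-flight entry point: the first pass emits only the pre-flight function (compiled CPU-only and executed up front to validate and size the output), and the second pass compiles and runs the function body. A condensed, standalone sketch of that two-pass control flow; the names below are placeholders, not engine API:

// Standalone sketch of the pre-flight/main two-pass pattern used in the listing.
struct Unit {
  bool has_pre_flight;  // corresponds to table_func.containsPreFlightFn()
};

template <typename Compile, typename Execute>
void run_with_optional_preflight(const Unit& unit, Compile compile, Execute execute) {
  if (unit.has_pre_flight) {
    // In the listing, the pre-flight pass uses CompilationOptions::makeCpuOnly(co).
    auto preflight_code = compile(/*emit_only_preflight_fn=*/true);
    execute(preflight_code, /*is_pre_launch_udtf=*/true);
  }
  auto main_code = compile(/*emit_only_preflight_fn=*/false);
  execute(main_code, /*is_pre_launch_udtf=*/false);
}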

2420  {
2421  INJECT_TIMER(Exec_executeTableFunction);
2422  if (eo.just_validate) {
2424  /*entry_count=*/0,
2426  return std::make_shared<ResultSet>(
2428  co.device_type,
2430  this->getRowSetMemoryOwner(),
2431  this->blockSize(),
2432  this->gridSize());
2433  }
2434 
2435  // Avoid compiling functions that set the sizer at runtime if the device is GPU
2436  // This should be fixed in the Python script as well to minimize the number of
2437  // QueryMustRunOnCpu exceptions
2440  throw QueryMustRunOnCpu();
2441  }
2442 
2443  ColumnCacheMap column_cache; // Note: if we add retries to the table function
2444  // framework, we may want to move this up a level
2445 
2446  ColumnFetcher column_fetcher(this, column_cache);
2448 
2449  if (exe_unit.table_func.containsPreFlightFn()) {
2450  std::shared_ptr<CompilationContext> compilation_context;
2451  {
2452  Executor::CgenStateManager cgenstate_manager(*this,
2453  false,
2454  table_infos,
2456  nullptr); // locks compilation_mutex
2458  TableFunctionCompilationContext tf_compilation_context(this, pre_flight_co);
2459  compilation_context =
2460  tf_compilation_context.compile(exe_unit, true /* emit_only_preflight_fn*/);
2461  }
2462  exe_context.execute(exe_unit,
2463  table_infos,
2464  compilation_context,
2465  column_fetcher,
2467  this,
2468  true /* is_pre_launch_udtf */);
2469  }
2470  std::shared_ptr<CompilationContext> compilation_context;
2471  {
2472  Executor::CgenStateManager cgenstate_manager(*this,
2473  false,
2474  table_infos,
2476  nullptr); // locks compilation_mutex
2477  TableFunctionCompilationContext tf_compilation_context(this, co);
2478  compilation_context =
2479  tf_compilation_context.compile(exe_unit, false /* emit_only_preflight_fn */);
2480  }
2481  return exe_context.execute(exe_unit,
2482  table_infos,
2483  compilation_context,
2484  column_fetcher,
2485  co.device_type,
2486  this,
2487  false /* is_pre_launch_udtf */);
2488 }
std::unordered_map< shared::TableKey, const ColumnDescriptor * > DeletedColumnsMap
Definition: PlanState.h:44
const std::shared_ptr< RowSetMemoryOwner > getRowSetMemoryOwner() const
Definition: Execute.cpp:703
const table_functions::TableFunction table_func
static CompilationOptions makeCpuOnly(const CompilationOptions &in)
#define INJECT_TIMER(DESC)
Definition: measure.h:96
ExecutorDeviceType device_type
unsigned gridSize() const
Definition: Execute.cpp:4318
std::unordered_map< shared::TableKey, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:766
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
std::vector< Analyzer::Expr * > target_exprs
unsigned blockSize() const
Definition: Execute.cpp:4332

+ Here is the call graph for this function:

TableUpdateMetadata Executor::executeUpdate ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const TableDescriptor updated_table_desc,
const CompilationOptions co,
const ExecutionOptions eo,
const Catalog_Namespace::Catalog cat,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const UpdateLogForFragment::Callback cb,
const bool  is_agg 
)

Definition at line 65 of file ExecuteUpdate.cpp.

References anonymous_namespace{Utm.h}::a, CHECK, CHECK_EQ, CHECK_GT, CPU, FragmentsPerTable::fragment_ids, g_enable_auto_metadata_update, SharedKernelContext::getFragmentResults(), SharedKernelContext::getFragOffsets(), Catalog_Namespace::Catalog::getMetadataForTable(), KernelPerFragment, query_mem_desc, ExecutionKernel::run(), TableDescriptor::tableId, timer_start(), timer_stop(), and VLOG.
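The listing below sizes the group-by output buffer from the largest outer fragment; for aggregating updates the guess is capped at twice that tuple count or 100 million entries, whichever is smaller. A standalone sketch of that sizing heuristic (hypothetical helper; the logic matches lines 105-115 of the listing):

#include <algorithm>
#include <cstdint>
#include <vector>

// tuple_counts holds the per-fragment tuple counts of the outer table and must be
// non-empty (the listing returns early when there are no outer fragments).
int64_t groups_buffer_entry_guess(const std::vector<int64_t>& tuple_counts,
                                  const bool is_agg) {
  const int64_t max_tuples =
      *std::max_element(tuple_counts.begin(), tuple_counts.end());
  return is_agg ? std::min<int64_t>(2 * max_tuples, 100'000'000) : max_tuples;
}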

74  {
75  CHECK(cb);
76  CHECK(table_desc_for_update);
77  VLOG(1) << "Executor " << executor_id_
78  << " is executing update/delete work unit:" << ra_exe_unit_in;
79 
80  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
81  ColumnCacheMap column_cache;
82 
83  ColumnFetcher column_fetcher(this, column_cache);
84  CHECK_GT(ra_exe_unit.input_descs.size(), size_t(0));
85  const auto& outer_table_key = ra_exe_unit.input_descs[0].getTableKey();
86  CHECK_EQ(outer_table_key, table_infos.front().table_key);
87  const auto& outer_fragments = table_infos.front().info.fragments;
88 
89  std::vector<FragmentsPerTable> fragments = {{{0, 0}, {0}}};
90  for (size_t tab_idx = 1; tab_idx < ra_exe_unit.input_descs.size(); tab_idx++) {
91  const auto& table_key = ra_exe_unit.input_descs[tab_idx].getTableKey();
92  CHECK_EQ(table_infos[tab_idx].table_key, table_key);
93  const auto& fragmentsPerTable = table_infos[tab_idx].info.fragments;
94  FragmentsPerTable entry = {table_key, {}};
95  for (size_t innerFragId = 0; innerFragId < fragmentsPerTable.size(); innerFragId++) {
96  entry.fragment_ids.push_back(innerFragId);
97  }
98  fragments.push_back(entry);
99  }
100 
101  if (outer_fragments.empty()) {
102  return {};
103  }
104 
105  const auto max_tuple_count_fragment_it = std::max_element(
106  outer_fragments.begin(), outer_fragments.end(), [](const auto& a, const auto& b) {
107  return a.getNumTuples() < b.getNumTuples();
108  });
109  CHECK(max_tuple_count_fragment_it != outer_fragments.end());
110  int64_t global_max_groups_buffer_entry_guess =
111  max_tuple_count_fragment_it->getNumTuples();
112  if (is_agg) {
113  global_max_groups_buffer_entry_guess = std::min(
114  2 * global_max_groups_buffer_entry_guess, static_cast<int64_t>(100'000'000));
115  }
116 
117  auto query_comp_desc = std::make_unique<QueryCompilationDescriptor>();
118  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
119  {
120  query_mem_desc = query_comp_desc->compile(global_max_groups_buffer_entry_guess,
121  8,
122  /*has_cardinality_estimation=*/true,
123  ra_exe_unit,
124  table_infos,
125  deleted_cols_map,
126  column_fetcher,
127  co,
128  eo,
129  nullptr,
130  this);
131  }
132  CHECK(query_mem_desc);
133  // Since we execute updates one thread/fragment at a time,
134  // buffer re-use is not applicable and can cause issues
135  // when the contents of the output buffer are written to storage
136  query_mem_desc->setThreadsCanReuseGroupByBuffers(false);
137 
138  TableUpdateMetadata table_update_metadata;
139  for (size_t fragment_index = 0; fragment_index < outer_fragments.size();
140  ++fragment_index) {
141  const int64_t crt_fragment_tuple_count =
142  outer_fragments[fragment_index].getNumTuples();
143  if (crt_fragment_tuple_count == 0) {
144  // nothing to update
145  continue;
146  }
147  SharedKernelContext shared_context(table_infos);
148  const auto& frag_offsets = shared_context.getFragOffsets();
149  auto skip_frag = skipFragment(ra_exe_unit.input_descs[0],
150  outer_fragments[fragment_index],
151  ra_exe_unit.simple_quals,
152  frag_offsets,
153  fragment_index);
154  if (skip_frag.first) {
155  VLOG(2) << "Update/delete skipping fragment with table id: "
156  << outer_fragments[fragment_index].physicalTableId
157  << ", fragment id: " << fragment_index;
158  continue;
159  }
160  fragments[0] = {outer_table_key, {fragment_index}};
161 
162  {
163  ExecutionKernel current_fragment_kernel(ra_exe_unit,
165  0,
166  eo,
167  column_fetcher,
168  *query_comp_desc,
169  *query_mem_desc,
170  fragments,
172  /*render_info=*/nullptr,
173  /*rowid_lookup_key=*/-1);
174 
175  auto clock_begin = timer_start();
176  std::lock_guard<std::mutex> kernel_lock(kernel_mutex_);
177  kernel_queue_time_ms_ += timer_stop(clock_begin);
178 
179  current_fragment_kernel.run(this, 0, shared_context);
180  }
181  const auto& proj_fragment_results = shared_context.getFragmentResults();
182  if (proj_fragment_results.empty()) {
183  continue;
184  }
185  const auto& proj_fragment_result = proj_fragment_results[0];
186  const auto proj_result_set = proj_fragment_result.first;
187  CHECK(proj_result_set);
188  cb({outer_fragments[fragment_index], fragment_index, proj_result_set},
189  table_update_metadata);
190  }
191 
193  auto td = cat.getMetadataForTable(table_desc_for_update->tableId);
194  TableOptimizer table_optimizer{td, this, cat};
195  table_optimizer.recomputeMetadataUnlocked(table_update_metadata);
196  }
197  return table_update_metadata;
198 }
bool is_agg(const Analyzer::Expr *expr)
#define CHECK_EQ(x, y)
Definition: Logger.h:301
int64_t kernel_queue_time_ms_
Definition: Execute.h:1562
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
Definition: Execute.cpp:4441
std::vector< InputDescriptor > input_descs
Driver for running cleanup processes on a table. TableOptimizer provides functions for various cleanu...
TypeR::rep timer_stop(Type clock_begin)
Definition: measure.h:48
bool g_enable_auto_metadata_update
static std::mutex kernel_mutex_
Definition: Execute.h:1641
#define CHECK_GT(x, y)
Definition: Logger.h:305
constexpr double a
Definition: Utm.h:32
const ExecutorId executor_id_
Definition: Execute.h:1476
std::pair< bool, int64_t > skipFragment(const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
Definition: Execute.cpp:4624
std::unordered_map< shared::TableKey, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
#define CHECK(condition)
Definition: Logger.h:291
std::vector< size_t > fragment_ids
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
#define VLOG(n)
Definition: Logger.h:388
Type timer_start()
Definition: measure.h:42
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals

+ Here is the call graph for this function:

ResultSetPtr Executor::executeWorkUnit ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit ra_exe_unit_in,
const CompilationOptions co,
const ExecutionOptions options,
RenderInfo render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap column_cache 
)

Definition at line 2074 of file Execute.cpp.

References cgen_state_, compilation_queue_time_ms_, executeWorkUnitImpl(), executor_id_, ExecutionOptions::just_validate, kernel_queue_time_ms_, CompilationRetryNewScanLimit::new_scan_limit_, plan_state_, anonymous_namespace{Execute.cpp}::replace_scan_limit(), run_benchmark_import::result, row_set_mem_owner_, and VLOG.
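executeWorkUnit delegates to executeWorkUnitImpl(), cleaning up per-query codegen state afterwards and retrying exactly once when compilation throws CompilationRetryNewScanLimit with a tighter scan limit. A standalone sketch of that retry shape; the exception type and callback below are placeholders for the engine types used in the listing:

#include <cstddef>
#include <functional>
#include <stdexcept>

// Placeholder for CompilationRetryNewScanLimit: carries the suggested scan limit.
struct RetryNewScanLimit : std::runtime_error {
  size_t new_scan_limit;
  explicit RetryNewScanLimit(const size_t limit)
      : std::runtime_error("retry with new scan limit"), new_scan_limit(limit) {}
};

// run(scan_limit) performs one execution attempt; a thrown RetryNewScanLimit
// triggers a single retry with the suggested limit, as the catch block below does.
template <typename Result>
Result execute_with_scan_limit_retry(const std::function<Result(size_t)>& run,
                                     const size_t initial_limit) {
  try {
    return run(initial_limit);
  } catch (const RetryNewScanLimit& e) {
    return run(e.new_scan_limit);
  }
}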

2082  {
2083  VLOG(1) << "Executor " << executor_id_ << " is executing work unit:" << ra_exe_unit_in;
2084  ScopeGuard cleanup_post_execution = [this] {
2085  // cleanup/unpin GPU buffer allocations
2086  // TODO: separate out this state into a single object
2087  plan_state_.reset(nullptr);
2088  if (cgen_state_) {
2089  cgen_state_->in_values_bitmaps_.clear();
2090  cgen_state_->str_dict_translation_mgrs_.clear();
2091  cgen_state_->tree_model_prediction_mgrs_.clear();
2092  }
2093  row_set_mem_owner_->clearNonOwnedGroupByBuffers();
2094  };
2095 
2096  try {
2097  auto result = executeWorkUnitImpl(max_groups_buffer_entry_guess,
2098  is_agg,
2099  true,
2100  query_infos,
2101  ra_exe_unit_in,
2102  co,
2103  eo,
2105  render_info,
2106  has_cardinality_estimation,
2107  column_cache);
2108  if (result) {
2109  result->setKernelQueueTime(kernel_queue_time_ms_);
2110  result->addCompilationQueueTime(compilation_queue_time_ms_);
2111  if (eo.just_validate) {
2112  result->setValidationOnlyRes();
2113  }
2114  }
2115  return result;
2116  } catch (const CompilationRetryNewScanLimit& e) {
2117  auto result =
2118  executeWorkUnitImpl(max_groups_buffer_entry_guess,
2119  is_agg,
2120  false,
2121  query_infos,
2122  replace_scan_limit(ra_exe_unit_in, e.new_scan_limit_),
2123  co,
2124  eo,
2126  render_info,
2127  has_cardinality_estimation,
2128  column_cache);
2129  if (result) {
2130  result->setKernelQueueTime(kernel_queue_time_ms_);
2131  result->addCompilationQueueTime(compilation_queue_time_ms_);
2132  if (eo.just_validate) {
2133  result->setValidationOnlyRes();
2134  }
2135  }
2136  return result;
2137  }
2138 }
bool is_agg(const Analyzer::Expr *expr)
int64_t kernel_queue_time_ms_
Definition: Execute.h:1562
int64_t compilation_queue_time_ms_
Definition: Execute.h:1563
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
const ExecutorId executor_id_
Definition: Execute.h:1476
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: Execute.h:1533
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
RelAlgExecutionUnit replace_scan_limit(const RelAlgExecutionUnit &ra_exe_unit_in, const size_t new_scan_limit)
Definition: Execute.cpp:2050
ResultSetPtr executeWorkUnitImpl(size_t &max_groups_buffer_entry_guess, const bool is_agg, const bool allow_single_frag_table_opt, const std::vector< InputTableInfo > &, const RelAlgExecutionUnit &, const CompilationOptions &, const ExecutionOptions &options, std::shared_ptr< RowSetMemoryOwner >, RenderInfo *render_info, const bool has_cardinality_estimation, ColumnCacheMap &column_cache)
Definition: Execute.cpp:2140
#define VLOG(n)
Definition: Logger.h:388

+ Here is the call graph for this function:

ResultSetPtr Executor::executeWorkUnitImpl ( size_t &  max_groups_buffer_entry_guess,
const bool  is_agg,
const bool  allow_single_frag_table_opt,
const std::vector< InputTableInfo > &  query_infos,
const RelAlgExecutionUnit ra_exe_unit_in,
const CompilationOptions co,
const ExecutionOptions options,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
RenderInfo render_info,
const bool  has_cardinality_estimation,
ColumnCacheMap column_cache 
)
private

Definition at line 2140 of file Execute.cpp.

References addDeletedColumn(), ExecutionOptions::allow_runtime_query_interrupt, blockSize(), CHECK, CHECK_EQ, checkIsQuerySessionEnrolled(), collectAllDeviceResults(), anonymous_namespace{Execute.cpp}::compute_buffer_entry_guess(), CPU, cpu_threads(), createKernels(), data_mgr_, CompilationOptions::device_type, ERR_INTERRUPTED, ERR_OUT_OF_SLOTS, ERR_OUT_OF_TIME, ERR_OVERFLOW_OR_UNDERFLOW, ExecutionOptions::estimate_output_cardinality, executeExplain(), executor_session_mutex_, ExecutionOptions::executor_type, ColumnFetcher::freeLinearizedBuf(), ColumnFetcher::freeTemporaryCpuLinearizedIdxBuf(), g_enable_executor_resource_mgr, get_available_gpus(), get_context_count(), getCurrentQuerySession(), getDeviceTypeForTargets(), QueryExecutionError::getErrorCode(), SharedKernelContext::getFragmentResults(), gridSize(), INJECT_TIMER, interrupted_, ExecutionOptions::just_explain, ExecutionOptions::just_validate, launchKernelsLocked(), launchKernelsViaResourceMgr(), MAX_BYTE_WIDTH_SUPPORTED, Native, RelAlgExecutionUnit::per_device_cardinality, plan_state_, Projection, QueryMemoryDescriptor, run_benchmark_import::result, resultsUnion(), QuerySessionStatus::RUNNING_REDUCTION, toString(), updateQuerySessionStatus(), VLOG, and ExecutionOptions::with_dynamic_watchdog.

Referenced by executeWorkUnit().
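The body below wraps compilation and kernel launch in a do/while loop keyed on the minimum output slot width: a failed compaction or an overflow/underflow doubles crt_min_byte_width and retries, and the loop stops once the width would exceed sizeof(int64_t). A standalone sketch of that widening retry loop, with a hypothetical attempt() callback standing in for the compile-and-execute body:

#include <cstddef>
#include <cstdint>
#include <functional>

// attempt(min_width) compiles and executes one pass; it returns false when the
// pass must be redone with wider output slots (overflow/underflow or a failed
// compaction), true otherwise.
bool run_with_widening_retries(const std::function<bool(int8_t)>& attempt,
                               int8_t crt_min_byte_width) {
  do {
    if (attempt(crt_min_byte_width)) {
      return true;
    }
    crt_min_byte_width <<= 1;  // widen and retry, as the catch handlers below do
  } while (static_cast<size_t>(crt_min_byte_width) <= sizeof(int64_t));
  return false;
}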

2151  {
2152  INJECT_TIMER(Exec_executeWorkUnit);
2153  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
2154  const auto device_type = getDeviceTypeForTargets(ra_exe_unit, co.device_type);
2155  CHECK(!query_infos.empty());
2156  if (!max_groups_buffer_entry_guess) {
2157  // The query has failed the first execution attempt because of running out
2158  // of group by slots. Make the conservative choice: allocate fragment size
2159  // slots and run on the CPU.
2160  CHECK(device_type == ExecutorDeviceType::CPU);
2161  max_groups_buffer_entry_guess =
2162  compute_buffer_entry_guess(query_infos, ra_exe_unit_in);
2163  }
2164 
2165  int8_t crt_min_byte_width{MAX_BYTE_WIDTH_SUPPORTED};
2166  CompilationOptions copied_co = co;
2167  copied_co.device_type = device_type;
2168  do {
2169  SharedKernelContext shared_context(query_infos);
2170  ColumnFetcher column_fetcher(this, column_cache);
2171  ScopeGuard scope_guard = [&column_fetcher] {
2172  column_fetcher.freeLinearizedBuf();
2173  column_fetcher.freeTemporaryCpuLinearizedIdxBuf();
2174  };
2175  auto query_comp_desc_owned = std::make_unique<QueryCompilationDescriptor>();
2176  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc_owned;
2177  if (eo.executor_type == ExecutorType::Native) {
2178  try {
2179  INJECT_TIMER(query_step_compilation);
2180  query_mem_desc_owned =
2181  query_comp_desc_owned->compile(max_groups_buffer_entry_guess,
2182  crt_min_byte_width,
2183  has_cardinality_estimation,
2184  ra_exe_unit,
2185  query_infos,
2186  deleted_cols_map,
2187  column_fetcher,
2188  copied_co,
2189  eo,
2190  render_info,
2191  this);
2192  CHECK(query_mem_desc_owned);
2193  crt_min_byte_width = query_comp_desc_owned->getMinByteWidth();
2194  } catch (CompilationRetryNoCompaction& e) {
2195  VLOG(1) << e.what();
2196  crt_min_byte_width = MAX_BYTE_WIDTH_SUPPORTED;
2197  continue;
2198  }
2199  } else {
2200  plan_state_.reset(new PlanState(false, query_infos, deleted_cols_map, this));
2201  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2202  CHECK(!query_mem_desc_owned);
2203  query_mem_desc_owned.reset(
2205  }
2206  if (eo.just_explain) {
2207  return executeExplain(*query_comp_desc_owned);
2208  }
2209 
2210  if (query_mem_desc_owned->canUsePerDeviceCardinality(ra_exe_unit)) {
2211  auto const max_rows_per_device =
2212  query_mem_desc_owned->getMaxPerDeviceCardinality(ra_exe_unit);
2213  if (max_rows_per_device && *max_rows_per_device >= 0 &&
2214  *max_rows_per_device < query_mem_desc_owned->getEntryCount()) {
2215  VLOG(1) << "Setting the max per device cardinality of {max_rows_per_device} as "
2216  "the new scan limit: "
2217  << *max_rows_per_device;
2218  throw CompilationRetryNewScanLimit(*max_rows_per_device);
2219  }
2220  }
2221 
2222  if (!eo.just_validate) {
2223  int available_cpus = cpu_threads();
2224  auto available_gpus = get_available_gpus(data_mgr_);
2225 
2226  const auto context_count =
2227  get_context_count(device_type, available_cpus, available_gpus.size());
2228  try {
2229  auto kernels = createKernels(shared_context,
2230  ra_exe_unit,
2231  column_fetcher,
2232  query_infos,
2233  eo,
2234  is_agg,
2235  allow_single_frag_table_opt,
2236  context_count,
2237  *query_comp_desc_owned,
2238  *query_mem_desc_owned,
2239  render_info,
2240  available_gpus,
2241  available_cpus);
2243  launchKernelsViaResourceMgr(shared_context,
2244  std::move(kernels),
2245  query_comp_desc_owned->getDeviceType(),
2246  ra_exe_unit.input_descs,
2247  *query_mem_desc_owned);
2248  } else {
2250  shared_context, std::move(kernels), query_comp_desc_owned->getDeviceType());
2251  }
2252 
2253  } catch (QueryExecutionError& e) {
2254  if (eo.with_dynamic_watchdog && interrupted_.load() &&
2255  e.getErrorCode() == ERR_OUT_OF_TIME) {
2257  }
2258  if (e.getErrorCode() == ERR_INTERRUPTED) {
2260  }
2262  static_cast<size_t>(crt_min_byte_width << 1) <= sizeof(int64_t)) {
2263  crt_min_byte_width <<= 1;
2264  continue;
2265  }
2266  throw;
2267  }
2268  }
2269  if (is_agg) {
2270  if (eo.allow_runtime_query_interrupt && ra_exe_unit.query_state) {
2271  // update query status to let user know we are now in the reduction phase
2272  std::string curRunningSession{""};
2273  std::string curRunningQuerySubmittedTime{""};
2274  bool sessionEnrolled = false;
2275  {
2278  curRunningSession = getCurrentQuerySession(session_read_lock);
2279  curRunningQuerySubmittedTime = ra_exe_unit.query_state->getQuerySubmittedTime();
2280  sessionEnrolled =
2281  checkIsQuerySessionEnrolled(curRunningSession, session_read_lock);
2282  }
2283  if (!curRunningSession.empty() && !curRunningQuerySubmittedTime.empty() &&
2284  sessionEnrolled) {
2285  updateQuerySessionStatus(curRunningSession,
2286  curRunningQuerySubmittedTime,
2288  }
2289  }
2290  try {
2291  if (eo.estimate_output_cardinality) {
2292  for (const auto& result : shared_context.getFragmentResults()) {
2293  auto row = result.first->getNextRow(false, false);
2294  CHECK_EQ(1u, row.size());
2295  auto scalar_r = boost::get<ScalarTargetValue>(&row[0]);
2296  CHECK(scalar_r);
2297  auto p = boost::get<int64_t>(scalar_r);
2298  CHECK(p);
2299  // todo(yoonmin): sort the frag_ids to make it consistent for later usage
2300  auto frag_ids = result.second;
2301  VLOG(1) << "Filtered cardinality for fragments-{" << ::toString(result.second)
2302  << "} : " << static_cast<size_t>(*p);
2303  ra_exe_unit_in.per_device_cardinality.emplace_back(result.second,
2304  static_cast<size_t>(*p));
2305  result.first->moveToBegin();
2306  }
2307  }
2308  return collectAllDeviceResults(shared_context,
2309  ra_exe_unit,
2310  *query_mem_desc_owned,
2311  query_comp_desc_owned->getDeviceType(),
2312  row_set_mem_owner);
2313  } catch (ReductionRanOutOfSlots&) {
2315  } catch (OverflowOrUnderflow&) {
2316  crt_min_byte_width <<= 1;
2317  continue;
2318  } catch (QueryExecutionError& e) {
2319  VLOG(1) << "Error received! error_code: " << e.getErrorCode()
2320  << ", what(): " << e.what();
2321  throw QueryExecutionError(e.getErrorCode());
2322  }
2323  }
2324  return resultsUnion(shared_context, ra_exe_unit);
2325 
2326  } while (static_cast<size_t>(crt_min_byte_width) <= sizeof(int64_t));
2327 
2328  return std::make_shared<ResultSet>(std::vector<TargetInfo>{},
2331  nullptr,
2332  blockSize(),
2333  gridSize());
2334 }
bool is_agg(const Analyzer::Expr *expr)
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::vector< std::unique_ptr< ExecutionKernel > > createKernels(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, ColumnFetcher &column_fetcher, const std::vector< InputTableInfo > &table_infos, const ExecutionOptions &eo, const bool is_agg, const bool allow_single_frag_table_opt, const size_t context_count, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, RenderInfo *render_info, std::unordered_set< int > &available_gpus, int &available_cpus)
Definition: Execute.cpp:2874
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
int32_t getErrorCode() const
Definition: ErrorHandling.h:55
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
std::atomic< bool > interrupted_
Definition: Execute.h:1543
void updateQuerySessionStatus(const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus new_query_status)
Definition: Execute.cpp:5041
std::unordered_set< int > get_available_gpus(const Data_Namespace::DataMgr *data_mgr)
Definition: Execute.cpp:1727
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
Definition: Execute.cpp:4441
std::string toString(const QueryDescriptionType &type)
Definition: Types.h:64
std::shared_lock< T > shared_lock
size_t compute_buffer_entry_guess(const std::vector< InputTableInfo > &query_infos, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:1753
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174
ResultSetPtr collectAllDeviceResults(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
Definition: Execute.cpp:2682
std::vector< std::pair< std::vector< size_t >, size_t > > per_device_cardinality
size_t get_context_count(const ExecutorDeviceType device_type, const size_t cpu_count, const size_t gpu_count)
Definition: Execute.cpp:1741
#define INJECT_TIMER(DESC)
Definition: measure.h:96
friend class QueryMemoryDescriptor
Definition: Execute.h:1658
static const int32_t ERR_OVERFLOW_OR_UNDERFLOW
Definition: Execute.h:1621
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:1622
bool checkIsQuerySessionEnrolled(const QuerySessionId &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:5231
ExecutorDeviceType device_type
void launchKernelsLocked(SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type)
Definition: Execute.cpp:3091
ResultSetPtr resultsUnion(SharedKernelContext &shared_context, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:1538
QuerySessionId & getCurrentQuerySession(heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:4952
unsigned gridSize() const
Definition: Execute.cpp:4318
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
#define CHECK(condition)
Definition: Logger.h:291
static const int32_t ERR_OUT_OF_SLOTS
Definition: Execute.h:1617
constexpr int8_t MAX_BYTE_WIDTH_SUPPORTED
ExecutorDeviceType getDeviceTypeForTargets(const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType requested_device_type)
Definition: Execute.cpp:2546
void launchKernelsViaResourceMgr(SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type, const std::vector< InputDescriptor > &input_descs, const QueryMemoryDescriptor &query_mem_desc)
Launches a vector of kernels for a given query step, gated/scheduled by ExecutorResourceMgr.
Definition: Execute.cpp:3103
unsigned blockSize() const
Definition: Execute.cpp:4332
int cpu_threads()
Definition: thread_count.h:25
#define VLOG(n)
Definition: Logger.h:388
ResultSetPtr executeExplain(const QueryCompilationDescriptor &)
Definition: Execute.cpp:2490

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Executor::executeWorkUnitPerFragment ( const RelAlgExecutionUnit ra_exe_unit,
const InputTableInfo table_info,
const CompilationOptions co,
const ExecutionOptions eo,
const Catalog_Namespace::Catalog cat,
PerFragmentCallBack cb,
const std::set< size_t > &  fragment_indexes_param 
)
private

Compiles and dispatches a work unit per fragment, processing the results with the per-fragment callback. Currently used for computing metrics over fragments (metadata).

Definition at line 2336 of file Execute.cpp.

References addDeletedColumn(), CHECK, CHECK_EQ, CompilationOptions::device_type, Fragmenter_Namespace::TableInfo::fragments, SharedKernelContext::getFragmentResults(), InputTableInfo::info, kernel_mutex_, kernel_queue_time_ms_, KernelPerFragment, ExecutionKernel::run(), timer_start(), and timer_stop().
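Callers pass a per-fragment callback that receives each fragment's result set together with the fragment's metadata; the listing below runs one kernel per selected fragment (an empty fragment_indexes_param means all fragments) and invokes the callback once per result. A standalone, type-agnostic sketch of that shape; the engine's PerFragmentCallBack signature is assumed rather than quoted:

#include <cstddef>
#include <functional>
#include <set>
#include <vector>

// ResultT and FragmentT are placeholders for the engine's result-set pointer and
// fragment-info types; run_kernel stands in for ExecutionKernel::run().
template <typename ResultT, typename FragmentT>
void for_each_fragment_result(
    const std::vector<FragmentT>& fragments,
    const std::set<size_t>& selected,  // empty set means "all fragments"
    const std::function<ResultT(const FragmentT&)>& run_kernel,
    const std::function<void(const ResultT&, const FragmentT&)>& cb) {
  std::set<size_t> indexes = selected;
  if (indexes.empty()) {
    for (size_t i = 0; i < fragments.size(); ++i) {
      indexes.insert(i);
    }
  }
  for (const size_t i : indexes) {
    cb(run_kernel(fragments[i]), fragments[i]);
  }
}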

2343  {
2344  const auto [ra_exe_unit, deleted_cols_map] = addDeletedColumn(ra_exe_unit_in, co);
2345  ColumnCacheMap column_cache;
2346 
2347  std::vector<InputTableInfo> table_infos{table_info};
2348  SharedKernelContext kernel_context(table_infos);
2349 
2350  ColumnFetcher column_fetcher(this, column_cache);
2351  auto query_comp_desc_owned = std::make_unique<QueryCompilationDescriptor>();
2352  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc_owned;
2353  {
2354  query_mem_desc_owned =
2355  query_comp_desc_owned->compile(0,
2356  8,
2357  /*has_cardinality_estimation=*/false,
2358  ra_exe_unit,
2359  table_infos,
2360  deleted_cols_map,
2361  column_fetcher,
2362  co,
2363  eo,
2364  nullptr,
2365  this);
2366  }
2367  CHECK(query_mem_desc_owned);
2368  CHECK_EQ(size_t(1), ra_exe_unit.input_descs.size());
2369  const auto table_key = ra_exe_unit.input_descs[0].getTableKey();
2370  const auto& outer_fragments = table_info.info.fragments;
2371 
2372  std::set<size_t> fragment_indexes;
2373  if (fragment_indexes_param.empty()) {
2374  // An empty `fragment_indexes_param` set implies executing
2375  // the query for all fragments in the table. In this
2376  // case, populate `fragment_indexes` with all fragment indexes.
2377  for (size_t i = 0; i < outer_fragments.size(); i++) {
2378  fragment_indexes.emplace(i);
2379  }
2380  } else {
2381  fragment_indexes = fragment_indexes_param;
2382  }
2383 
2384  {
2385  auto clock_begin = timer_start();
2386  std::lock_guard<std::mutex> kernel_lock(kernel_mutex_);
2387  kernel_queue_time_ms_ += timer_stop(clock_begin);
2388 
2389  for (auto fragment_index : fragment_indexes) {
2390  // We may want to consider in the future allowing this to execute on devices other
2391  // than CPU
2392  FragmentsList fragments_list{{table_key, {fragment_index}}};
2393  ExecutionKernel kernel(ra_exe_unit,
2394  co.device_type,
2395  /*device_id=*/0,
2396  eo,
2397  column_fetcher,
2398  *query_comp_desc_owned,
2399  *query_mem_desc_owned,
2400  fragments_list,
2402  /*render_info=*/nullptr,
2403  /*rowid_lookup_key=*/-1);
2404  kernel.run(this, 0, kernel_context);
2405  }
2406  }
2407 
2408  const auto& all_fragment_results = kernel_context.getFragmentResults();
2409 
2410  for (const auto& [result_set_ptr, result_fragment_indexes] : all_fragment_results) {
2411  CHECK_EQ(result_fragment_indexes.size(), 1);
2412  cb(result_set_ptr, outer_fragments[result_fragment_indexes[0]]);
2413  }
2414 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
int64_t kernel_queue_time_ms_
Definition: Execute.h:1562
Fragmenter_Namespace::TableInfo info
Definition: InputMetadata.h:35
std::tuple< RelAlgExecutionUnit, PlanState::DeletedColumnsMap > addDeletedColumn(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
Definition: Execute.cpp:4441
std::vector< InputDescriptor > input_descs
TypeR::rep timer_stop(Type clock_begin)
Definition: measure.h:48
std::vector< FragmentInfo > fragments
Definition: Fragmenter.h:171
static std::mutex kernel_mutex_
Definition: Execute.h:1641
std::vector< FragmentsPerTable > FragmentsList
ExecutorDeviceType device_type
std::unordered_map< shared::TableKey, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
#define CHECK(condition)
Definition: Logger.h:291
Type timer_start()
Definition: measure.h:42

+ Here is the call graph for this function:

FetchResult Executor::fetchChunks ( const ColumnFetcher column_fetcher,
const RelAlgExecutionUnit ra_exe_unit,
const int  device_id,
const Data_Namespace::MemoryLevel  memory_level,
const std::map< shared::TableKey, const TableFragments * > &  all_tables_fragments,
const FragmentsList selected_fragments,
std::list< ChunkIter > &  chunk_iterators,
std::list< std::shared_ptr< Chunk_NS::Chunk >> &  chunks,
DeviceAllocator device_allocator,
const size_t  thread_idx,
const bool  allow_runtime_interrupt 
)
private

Definition at line 3426 of file Execute.cpp.

References buildSelectedFragsMapping(), CHECK, CHECK_EQ, CHECK_LT, checkIsQuerySessionInterrupted(), Data_Namespace::CPU_LEVEL, DEBUG_TIMER, ERR_INTERRUPTED, executor_session_mutex_, g_enable_dynamic_watchdog, ColumnFetcher::getAllTableColumnFragments(), getCurrentQuerySession(), ColumnFetcher::getOneTableColumnFragment(), ColumnFetcher::getResultSetColumn(), getRowCountAndOffsetForAllFrags(), INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, interrupted_, ColumnFetcher::linearizeColumnFragments(), needFetchAllFragments(), needLinearizeAllFragments(), plan_state_, RESULT, anonymous_namespace{Execute.cpp}::try_get_column_descriptor(), and VLOG.
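For every input column the fetch loop below first picks a memory level: only columns the plan marks for fetching are materialized at the requested level (which may be GPU memory); all other columns are pinned to CPU memory so they can be fetched lazily. A minimal sketch of that decision; the enum and the predicate argument are placeholders for Data_Namespace::MemoryLevel and PlanState::isColumnToFetch():

enum class MemoryLevel { CPU_LEVEL, GPU_LEVEL };

// Columns the kernel does not read directly stay at CPU level for lazy fetch,
// matching the memory_level_for_column choice in the listing below.
MemoryLevel memory_level_for_column(const MemoryLevel requested,
                                    const bool is_column_to_fetch) {
  return is_column_to_fetch ? requested : MemoryLevel::CPU_LEVEL;
}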

3437  {
3438  auto timer = DEBUG_TIMER(__func__);
3440  const auto& col_global_ids = ra_exe_unit.input_col_descs;
3441  std::vector<std::vector<size_t>> selected_fragments_crossjoin;
3442  std::vector<size_t> local_col_to_frag_pos;
3443  buildSelectedFragsMapping(selected_fragments_crossjoin,
3444  local_col_to_frag_pos,
3445  col_global_ids,
3446  selected_fragments,
3447  ra_exe_unit);
3448 
3450  selected_fragments_crossjoin);
3451  std::vector<std::vector<const int8_t*>> all_frag_col_buffers;
3452  std::vector<std::vector<int64_t>> all_num_rows;
3453  std::vector<std::vector<uint64_t>> all_frag_offsets;
3454  for (const auto& selected_frag_ids : frag_ids_crossjoin) {
3455  std::vector<const int8_t*> frag_col_buffers(
3456  plan_state_->global_to_local_col_ids_.size());
3457  for (const auto& col_id : col_global_ids) {
3458  if (allow_runtime_interrupt) {
3459  bool isInterrupted = false;
3460  {
3463  const auto query_session = getCurrentQuerySession(session_read_lock);
3464  isInterrupted =
3465  checkIsQuerySessionInterrupted(query_session, session_read_lock);
3466  }
3467  if (isInterrupted) {
3469  }
3470  }
3471  if (g_enable_dynamic_watchdog && interrupted_.load()) {
3473  }
3474  CHECK(col_id);
3475  const auto cd = try_get_column_descriptor(col_id.get());
3476  if (cd && cd->isVirtualCol) {
3477  CHECK_EQ("rowid", cd->columnName);
3478  continue;
3479  }
3480  const auto& table_key = col_id->getScanDesc().getTableKey();
3481  const auto fragments_it = all_tables_fragments.find(table_key);
3482  CHECK(fragments_it != all_tables_fragments.end());
3483  const auto fragments = fragments_it->second;
3484  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
3485  CHECK(it != plan_state_->global_to_local_col_ids_.end());
3486  CHECK_LT(static_cast<size_t>(it->second),
3487  plan_state_->global_to_local_col_ids_.size());
3488  const size_t frag_id = selected_frag_ids[local_col_to_frag_pos[it->second]];
3489  if (!fragments->size()) {
3490  return {};
3491  }
3492  CHECK_LT(frag_id, fragments->size());
3493  auto memory_level_for_column = memory_level;
3494  const shared::ColumnKey tbl_col_key{col_id->getScanDesc().getTableKey(),
3495  col_id->getColId()};
3496  if (!plan_state_->isColumnToFetch(tbl_col_key)) {
3497  memory_level_for_column = Data_Namespace::CPU_LEVEL;
3498  }
3499  if (col_id->getScanDesc().getSourceType() == InputSourceType::RESULT) {
3500  frag_col_buffers[it->second] =
3501  column_fetcher.getResultSetColumn(col_id.get(),
3502  memory_level_for_column,
3503  device_id,
3504  device_allocator,
3505  thread_idx);
3506  } else {
3507  if (needFetchAllFragments(*col_id, ra_exe_unit, selected_fragments)) {
3508  // determine if we need special treatment to linearize a multi-frag table
3509  // i.e., a column that is classified as a varlen type, such as an array
3510  // for now, we only support fixed-length arrays that contain
3511  // geo point coordinates, but we can support more types in this way
3513  cd, *col_id, ra_exe_unit, selected_fragments, memory_level)) {
3514  bool for_lazy_fetch = false;
3515  if (plan_state_->isColumnToNotFetch(tbl_col_key)) {
3516  for_lazy_fetch = true;
3517  VLOG(2) << "Try to linearize lazy fetch column (col_id: " << cd->columnId
3518  << ", col_name: " << cd->columnName << ")";
3519  }
3520  frag_col_buffers[it->second] = column_fetcher.linearizeColumnFragments(
3521  col_id->getScanDesc().getTableKey(),
3522  col_id->getColId(),
3523  all_tables_fragments,
3524  chunks,
3525  chunk_iterators,
3526  for_lazy_fetch ? Data_Namespace::CPU_LEVEL : memory_level,
3527  for_lazy_fetch ? 0 : device_id,
3528  device_allocator,
3529  thread_idx);
3530  } else {
3531  frag_col_buffers[it->second] = column_fetcher.getAllTableColumnFragments(
3532  col_id->getScanDesc().getTableKey(),
3533  col_id->getColId(),
3534  all_tables_fragments,
3535  memory_level_for_column,
3536  device_id,
3537  device_allocator,
3538  thread_idx);
3539  }
3540  } else {
3541  frag_col_buffers[it->second] = column_fetcher.getOneTableColumnFragment(
3542  col_id->getScanDesc().getTableKey(),
3543  frag_id,
3544  col_id->getColId(),
3545  all_tables_fragments,
3546  chunks,
3547  chunk_iterators,
3548  memory_level_for_column,
3549  device_id,
3550  device_allocator);
3551  }
3552  }
3553  }
3554  all_frag_col_buffers.push_back(frag_col_buffers);
3555  }
3556  std::tie(all_num_rows, all_frag_offsets) = getRowCountAndOffsetForAllFrags(
3557  ra_exe_unit, frag_ids_crossjoin, ra_exe_unit.input_descs, all_tables_fragments);
3558  return {all_frag_col_buffers, all_num_rows, all_frag_offsets};
3559 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
bool checkIsQuerySessionInterrupted(const std::string &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:5220
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
std::atomic< bool > interrupted_
Definition: Execute.h:1543
std::vector< InputDescriptor > input_descs
const int8_t * getResultSetColumn(const InputColDescriptor *col_desc, const Data_Namespace::MemoryLevel memory_level, const int device_id, DeviceAllocator *device_allocator, const size_t thread_idx) const
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:81
const int8_t * getOneTableColumnFragment(const shared::TableKey &table_key, const int frag_id, const int col_id, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments, std::list< std::shared_ptr< Chunk_NS::Chunk >> &chunk_holder, std::list< ChunkIter > &chunk_iter_holder, const Data_Namespace::MemoryLevel memory_level, const int device_id, DeviceAllocator *device_allocator) const
bool needFetchAllFragments(const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
Definition: Execute.cpp:3384
const int8_t * getAllTableColumnFragments(const shared::TableKey &table_key, const int col_id, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments, const Data_Namespace::MemoryLevel memory_level, const int device_id, DeviceAllocator *device_allocator, const size_t thread_idx) const
std::shared_lock< T > shared_lock
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags(const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments)
Definition: Execute.cpp:3335
#define INJECT_TIMER(DESC)
Definition: measure.h:96
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define CHECK_LT(x, y)
Definition: Logger.h:303
QuerySessionId & getCurrentQuerySession(heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:4952
const int8_t * linearizeColumnFragments(const shared::TableKey &table_key, const int col_id, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments, std::list< std::shared_ptr< Chunk_NS::Chunk >> &chunk_holder, std::list< ChunkIter > &chunk_iter_holder, const Data_Namespace::MemoryLevel memory_level, const int device_id, DeviceAllocator *device_allocator, const size_t thread_idx) const
bool needLinearizeAllFragments(const ColumnDescriptor *cd, const InputColDescriptor &inner_col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments, const Data_Namespace::MemoryLevel memory_level) const
Definition: Execute.cpp:3403
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
void buildSelectedFragsMapping(std::vector< std::vector< size_t >> &selected_fragments_crossjoin, std::vector< size_t > &local_col_to_frag_pos, const std::list< std::shared_ptr< const InputColDescriptor >> &col_global_ids, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:3742
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
const ColumnDescriptor * try_get_column_descriptor(const InputColDescriptor *col_desc)
Definition: Execute.cpp:3308
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
#define VLOG(n)
Definition: Logger.h:388
FetchResult fetchChunks(const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< shared::TableKey, const TableFragments * > &, const FragmentsList &selected_fragments, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
Definition: Execute.cpp:3426

+ Here is the call graph for this function:

FetchResult Executor::fetchUnionChunks ( const ColumnFetcher column_fetcher,
const RelAlgExecutionUnit ra_exe_unit,
const int  device_id,
const Data_Namespace::MemoryLevel  memory_level,
const std::map< shared::TableKey, const TableFragments * > &  all_tables_fragments,
const FragmentsList selected_fragments,
std::list< ChunkIter > &  chunk_iterators,
std::list< std::shared_ptr< Chunk_NS::Chunk >> &  chunks,
DeviceAllocator device_allocator,
const size_t  thread_idx,
const bool  allow_runtime_interrupt 
)
private

Definition at line 3610 of file Execute.cpp.

References buildSelectedFragsMappingForUnion(), CHECK, CHECK_EQ, CHECK_LE, CHECK_LT, checkIsQuerySessionInterrupted(), Data_Namespace::CPU_LEVEL, DEBUG_TIMER, ERR_INTERRUPTED, executor_session_mutex_, anonymous_namespace{Execute.cpp}::get_selected_input_col_descs(), anonymous_namespace{Execute.cpp}::get_selected_input_col_descs_index(), anonymous_namespace{Execute.cpp}::get_selected_input_descs_index(), ColumnFetcher::getAllTableColumnFragments(), getCurrentQuerySession(), ColumnFetcher::getOneTableColumnFragment(), ColumnFetcher::getResultSetColumn(), getRowCountAndOffsetForAllFrags(), INJECT_TIMER, RelAlgExecutionUnit::input_col_descs, RelAlgExecutionUnit::input_descs, needFetchAllFragments(), plan_state_, shared::printContainer(), RESULT, anonymous_namespace{Execute.cpp}::set_mod_range(), anonymous_namespace{Execute.cpp}::try_get_column_descriptor(), and VLOG.
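fetchUnionChunks services one input of the UNION per call: selected_fragments holds exactly one entry, and the position of the matching input descriptor decides which row count and fragment offset are returned from the fetched buffers. A standalone sketch of that index lookup (placeholder TableKey type; mirrors get_selected_input_descs_index() used in the listing):

#include <cstddef>
#include <vector>

struct TableKey {
  int db_id{0};
  int table_id{0};
  bool operator==(const TableKey& other) const {
    return db_id == other.db_id && table_id == other.table_id;
  }
};

// Returns the position of the first input descriptor whose table matches key, or
// input_table_keys.size() when none matches (the listing CHECKs against that case).
size_t selected_input_index(const TableKey& key,
                            const std::vector<TableKey>& input_table_keys) {
  size_t i = 0;
  while (i < input_table_keys.size() && !(input_table_keys[i] == key)) {
    ++i;
  }
  return i;
}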

3621  {
3622  auto timer = DEBUG_TIMER(__func__);
3624 
3625  CHECK_EQ(1u, selected_fragments.size());
3626  CHECK_LE(2u, ra_exe_unit.input_descs.size());
3627  CHECK_LE(2u, ra_exe_unit.input_col_descs.size());
3628  auto const& input_descs = ra_exe_unit.input_descs;
3629  const auto& selected_table_key = selected_fragments.front().table_key;
3630  size_t const input_descs_index =
3631  get_selected_input_descs_index(selected_table_key, input_descs);
3632  CHECK_LT(input_descs_index, input_descs.size());
3633  size_t const input_col_descs_index =
3634  get_selected_input_col_descs_index(selected_table_key, ra_exe_unit.input_col_descs);
3635  CHECK_LT(input_col_descs_index, ra_exe_unit.input_col_descs.size());
3636  VLOG(2) << "selected_table_key=" << selected_table_key
3637  << " input_descs_index=" << input_descs_index
3638  << " input_col_descs_index=" << input_col_descs_index
3639  << " input_descs=" << shared::printContainer(input_descs)
3640  << " ra_exe_unit.input_col_descs="
3641  << shared::printContainer(ra_exe_unit.input_col_descs);
3642 
3643  std::list<std::shared_ptr<const InputColDescriptor>> selected_input_col_descs =
3644  get_selected_input_col_descs(selected_table_key, ra_exe_unit.input_col_descs);
3645  std::vector<std::vector<size_t>> selected_fragments_crossjoin;
3646 
3648  selected_fragments_crossjoin, selected_fragments, ra_exe_unit);
3649 
3651  selected_fragments_crossjoin);
3652 
3653  if (allow_runtime_interrupt) {
3654  bool isInterrupted = false;
3655  {
3658  const auto query_session = getCurrentQuerySession(session_read_lock);
3659  isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock);
3660  }
3661  if (isInterrupted) {
3663  }
3664  }
3665  std::vector<const int8_t*> frag_col_buffers(
3666  plan_state_->global_to_local_col_ids_.size());
3667  for (const auto& col_id : selected_input_col_descs) {
3668  CHECK(col_id);
3669  const auto cd = try_get_column_descriptor(col_id.get());
3670  if (cd && cd->isVirtualCol) {
3671  CHECK_EQ("rowid", cd->columnName);
3672  continue;
3673  }
3674  const auto fragments_it = all_tables_fragments.find(selected_table_key);
3675  CHECK(fragments_it != all_tables_fragments.end());
3676  const auto fragments = fragments_it->second;
3677  auto it = plan_state_->global_to_local_col_ids_.find(*col_id);
3678  CHECK(it != plan_state_->global_to_local_col_ids_.end());
3679  size_t const local_col_id = it->second;
3680  CHECK_LT(local_col_id, plan_state_->global_to_local_col_ids_.size());
3681  constexpr size_t frag_id = 0;
3682  if (fragments->empty()) {
3683  return {};
3684  }
3685  MemoryLevel const memory_level_for_column =
3686  plan_state_->isColumnToFetch({selected_table_key, col_id->getColId()})
3687  ? memory_level
3689  int8_t const* ptr;
3690  if (col_id->getScanDesc().getSourceType() == InputSourceType::RESULT) {
3691  ptr = column_fetcher.getResultSetColumn(
3692  col_id.get(), memory_level_for_column, device_id, device_allocator, thread_idx);
3693  } else if (needFetchAllFragments(*col_id, ra_exe_unit, selected_fragments)) {
3694  ptr = column_fetcher.getAllTableColumnFragments(selected_table_key,
3695  col_id->getColId(),
3696  all_tables_fragments,
3697  memory_level_for_column,
3698  device_id,
3699  device_allocator,
3700  thread_idx);
3701  } else {
3702  ptr = column_fetcher.getOneTableColumnFragment(selected_table_key,
3703  frag_id,
3704  col_id->getColId(),
3705  all_tables_fragments,
3706  chunks,
3707  chunk_iterators,
3708  memory_level_for_column,
3709  device_id,
3710  device_allocator);
3711  }
3712  // Set frag_col_buffers[i]=ptr for i in mod input_descs.size() range of local_col_id.
3713  set_mod_range(frag_col_buffers, ptr, local_col_id, input_descs.size());
3714  }
3715  auto const [num_rows, frag_offsets] = getRowCountAndOffsetForAllFrags(
3716  ra_exe_unit, frag_ids_crossjoin, input_descs, all_tables_fragments);
3717 
3718  VLOG(2) << "frag_col_buffers=" << shared::printContainer(frag_col_buffers)
3719  << " num_rows=" << shared::printContainer(num_rows)
3720  << " frag_offsets=" << shared::printContainer(frag_offsets)
3721  << " input_descs_index=" << input_descs_index
3722  << " input_col_descs_index=" << input_col_descs_index;
3723  return {{std::move(frag_col_buffers)},
3724  {{num_rows[0][input_descs_index]}},
3725  {{frag_offsets[0][input_descs_index]}}};
3726 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
bool checkIsQuerySessionInterrupted(const std::string &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:5220
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
void set_mod_range(std::vector< int8_t const * > &frag_col_buffers, int8_t const *const ptr, size_t const local_col_id, size_t const N)
Definition: Execute.cpp:3595
std::vector< InputDescriptor > input_descs
const int8_t * getResultSetColumn(const InputColDescriptor *col_desc, const Data_Namespace::MemoryLevel memory_level, const int device_id, DeviceAllocator *device_allocator, const size_t thread_idx) const
FetchResult fetchUnionChunks(const ColumnFetcher &, const RelAlgExecutionUnit &ra_exe_unit, const int device_id, const Data_Namespace::MemoryLevel, const std::map< shared::TableKey, const TableFragments * > &, const FragmentsList &selected_fragments, std::list< ChunkIter > &, std::list< std::shared_ptr< Chunk_NS::Chunk >> &, DeviceAllocator *device_allocator, const size_t thread_idx, const bool allow_runtime_interrupt)
Definition: Execute.cpp:3610
const int8_t * getOneTableColumnFragment(const shared::TableKey &table_key, const int frag_id, const int col_id, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments, std::list< std::shared_ptr< Chunk_NS::Chunk >> &chunk_holder, std::list< ChunkIter > &chunk_iter_holder, const Data_Namespace::MemoryLevel memory_level, const int device_id, DeviceAllocator *device_allocator) const
bool needFetchAllFragments(const InputColDescriptor &col_desc, const RelAlgExecutionUnit &ra_exe_unit, const FragmentsList &selected_fragments) const
Definition: Execute.cpp:3384
const int8_t * getAllTableColumnFragments(const shared::TableKey &table_key, const int col_id, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments, const Data_Namespace::MemoryLevel memory_level, const int device_id, DeviceAllocator *device_allocator, const size_t thread_idx) const
std::shared_lock< T > shared_lock
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > getRowCountAndOffsetForAllFrags(const RelAlgExecutionUnit &ra_exe_unit, const CartesianProduct< std::vector< std::vector< size_t >>> &frag_ids_crossjoin, const std::vector< InputDescriptor > &input_descs, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments)
Definition: Execute.cpp:3335
#define INJECT_TIMER(DESC)
Definition: measure.h:96
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
size_t get_selected_input_descs_index(const shared::TableKey &table_key, std::vector< InputDescriptor > const &input_descs)
Definition: Execute.cpp:3562
size_t get_selected_input_col_descs_index(const shared::TableKey &table_key, std::list< std::shared_ptr< InputColDescriptor const >> const &input_col_descs)
Definition: Execute.cpp:3571
#define CHECK_LT(x, y)
Definition: Logger.h:303
QuerySessionId & getCurrentQuerySession(heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:4952
#define CHECK_LE(x, y)
Definition: Logger.h:304
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
void buildSelectedFragsMappingForUnion(std::vector< std::vector< size_t >> &selected_fragments_crossjoin, const FragmentsList &selected_fragments, const RelAlgExecutionUnit &ra_exe_unit)
Definition: Execute.cpp:3773
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
const ColumnDescriptor * try_get_column_descriptor(const InputColDescriptor *col_desc)
Definition: Execute.cpp:3308
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:107
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
std::list< std::shared_ptr< const InputColDescriptor > > get_selected_input_col_descs(const shared::TableKey &table_key, std::list< std::shared_ptr< InputColDescriptor const >> const &input_col_descs)
Definition: Execute.cpp:3582
#define VLOG(n)
Definition: Logger.h:388

+ Here is the call graph for this function:

std::string Executor::generatePTX ( const std::string &  cuda_llir) const
private

Definition at line 1544 of file NativeCodegen.cpp.

1544  {
1546  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1547 }
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
Definition: Execute.h:1547
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
static std::string generatePTX(const std::string &cuda_llir, llvm::TargetMachine *nvptx_target_machine, llvm::LLVMContext &context)
const ExecutorResourceMgr_Namespace::ConcurrentResourceGrantPolicy Executor::get_concurrent_resource_grant_policy ( const ExecutorResourceMgr_Namespace::ResourceType  resource_type)
static

Definition at line 5433 of file Execute.cpp.

References executor_resource_mgr_, and g_enable_executor_resource_mgr.

5434  {
5436  throw std::runtime_error(
5437  "ExecutorResourceMgr must be enabled to obtain the executor concurrent "
5438  "resource grant policy.");
5439  }
5440  return executor_resource_mgr_->get_concurrent_resource_grant_policy(resource_type);
5441 }
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174
ExecutorResourceMgr_Namespace::ResourcePoolInfo Executor::get_executor_resource_pool_info ( )
static

Definition at line 5414 of file Execute.cpp.

References executor_resource_mgr_, and g_enable_executor_resource_mgr.

Referenced by foreign_storage::InternalExecutorStatsDataWrapper::initializeObjectsForTable().

5414  {
5416  throw std::runtime_error(
5417  "ExecutorResourceMgr must be enabled to obtain executor resource pool stats.");
5418  }
5419  return executor_resource_mgr_->get_resource_info();
5420 }
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174

+ Here is the caller graph for this function:

size_t Executor::get_executor_resource_pool_total_resource_quantity ( const ExecutorResourceMgr_Namespace::ResourceType  resource_type)
static

Definition at line 5404 of file Execute.cpp.

References executor_resource_mgr_, and g_enable_executor_resource_mgr.

5405  {
5406  if (!g_enable_executor_resource_mgr) {
5407  throw std::runtime_error(
5408  "ExecutorResourceMgr must be enabled to obtain executor resource pool stats.");
5409  }
5410  return executor_resource_mgr_->get_resource_info(resource_type).second;
5411 }
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174
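
The three static resource-manager accessors above (get_concurrent_resource_grant_policy, get_executor_resource_pool_info, and get_executor_resource_pool_total_resource_quantity) all delegate to executor_resource_mgr_ and throw unless g_enable_executor_resource_mgr is set. A minimal usage sketch, not taken from the source and assuming the ResourceType::cpu_slots enumerator for illustration:

// Sketch only: query the executor resource pool, guarding on the global flag
// exactly as the listings above do. ResourceType::cpu_slots is assumed here
// purely for illustration.
size_t report_cpu_slot_capacity() {
  if (!g_enable_executor_resource_mgr) {
    return 0;  // the static getters below would throw with the manager disabled
  }
  return Executor::get_executor_resource_pool_total_resource_quantity(
      ExecutorResourceMgr_Namespace::ResourceType::cpu_slots);
}
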
const std::unique_ptr<llvm::Module>& Executor::get_extension_module ( ExtModuleKinds  kind) const
inlineprivate

Definition at line 1504 of file Execute.h.

References extension_modules_.

Referenced by get_geos_module(), get_libdevice_module(), get_rt_module(), get_rt_udf_module(), and get_udf_module().

1504  {
1505  auto it = extension_modules_.find(kind);
1506  if (it != extension_modules_.end()) {
1507  return it->second;
1508  }
1509  static const std::unique_ptr<llvm::Module> empty;
1510  return empty;
1511  }
std::map< ExtModuleKinds, std::unique_ptr< llvm::Module > > extension_modules_
Definition: Execute.h:1517

+ Here is the caller graph for this function:

const std::unique_ptr<llvm::Module>& Executor::get_geos_module ( ) const
inline

Definition at line 545 of file Execute.h.

References get_extension_module(), and rt_geos_module.

545  {
546  return get_extension_module(ExtModuleKinds::rt_geos_module);
547  }
const std::unique_ptr< llvm::Module > & get_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1504

+ Here is the call graph for this function:

const std::unique_ptr<llvm::Module>& Executor::get_libdevice_module ( ) const
inline

Definition at line 548 of file Execute.h.

References get_extension_module(), and rt_libdevice_module.

548  {
549  return get_extension_module(ExtModuleKinds::rt_libdevice_module);
550  }
const std::unique_ptr< llvm::Module > & get_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1504

+ Here is the call graph for this function:

const std::unique_ptr<llvm::Module>& Executor::get_rt_module ( ) const
inline

Definition at line 532 of file Execute.h.

References get_extension_module(), and template_module.

532  {
533  return get_extension_module(ExtModuleKinds::template_module);
534  }
const std::unique_ptr< llvm::Module > & get_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1504

+ Here is the call graph for this function:

const std::unique_ptr<llvm::Module>& Executor::get_rt_udf_module ( bool  is_gpu = false) const
inline

Definition at line 539 of file Execute.h.

References get_extension_module(), register_runtime_extension_functions_mutex_, rt_udf_cpu_module, and rt_udf_gpu_module.

539  {
540  std::lock_guard<std::mutex> lock(
541  register_runtime_extension_functions_mutex_);
542  return get_extension_module(
543  is_gpu ? ExtModuleKinds::rt_udf_gpu_module : ExtModuleKinds::rt_udf_cpu_module);
544  }
const std::unique_ptr< llvm::Module > & get_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1504
static std::mutex register_runtime_extension_functions_mutex_
Definition: Execute.h:1640

+ Here is the call graph for this function:

const std::unique_ptr<llvm::Module>& Executor::get_udf_module ( bool  is_gpu = false) const
inline

Definition at line 535 of file Execute.h.

References get_extension_module(), udf_cpu_module, and udf_gpu_module.

535  {
536  return get_extension_module(
537  is_gpu ? ExtModuleKinds::udf_gpu_module : ExtModuleKinds::udf_cpu_module);
538  }
const std::unique_ptr< llvm::Module > & get_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1504

+ Here is the call graph for this function:

size_t Executor::getArenaBlockSize ( )
static

Definition at line 558 of file Execute.cpp.

References g_is_test_env, and kArenaBlockOverhead.

Referenced by ResultSetLogicalValuesBuilder::create(), RelAlgExecutor::prepareLeafExecution(), and setupCaching().

558  {
559  return g_is_test_env ? 100000000 : (1UL << 32) + kArenaBlockOverhead;
560 }
constexpr size_t kArenaBlockOverhead
bool g_is_test_env
Definition: Execute.cpp:149

+ Here is the caller graph for this function:
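
For reference, a short illustrative sketch (not from the source) of the values the listing above produces:

// Illustration only, mirroring the listing above.
const size_t arena_block_size = Executor::getArenaBlockSize();
// g_is_test_env == true  -> 100000000 bytes (~95 MiB)
// g_is_test_env == false -> (1UL << 32) + kArenaBlockOverhead,
//                           i.e. 4 GiB plus the per-block arena overhead.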

static size_t Executor::getBaselineThreshold ( bool  for_count_distinct,
ExecutorDeviceType  device_type 
)
inlinestatic

Definition at line 1448 of file Execute.h.

References baseline_threshold, and GPU.

Referenced by GroupByAndAggregate::getColRangeInfo().

1449  {
1450  return for_count_distinct ? (device_type == ExecutorDeviceType::GPU
1454  }
static const size_t baseline_threshold
Definition: Execute.h:1549
Executor(const ExecutorId id, Data_Namespace::DataMgr *data_mgr, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
Definition: Execute.cpp:272

+ Here is the caller graph for this function:

Executor::CachedCardinality Executor::getCachedCardinality ( const CardinalityCacheKey cache_key)

Definition at line 5264 of file Execute.cpp.

References cardinality_cache_, g_use_estimator_result_cache, recycler_mutex_, and VLOG.

5265  {
5266  heavyai::shared_lock<heavyai::shared_mutex> lock(recycler_mutex_);
5267  if (g_use_estimator_result_cache &&
5268  cardinality_cache_.find(cache_key) != cardinality_cache_.end()) {
5269  VLOG(1) << "Reuse cached cardinality";
5270  return {true, cardinality_cache_[cache_key]};
5271  }
5272  return {false, -1};
5273 }
std::shared_lock< T > shared_lock
static std::unordered_map< CardinalityCacheKey, size_t > cardinality_cache_
Definition: Execute.h:1607
static heavyai::shared_mutex recycler_mutex_
Definition: Execute.h:1605
bool g_use_estimator_result_cache
Definition: Execute.cpp:135
#define VLOG(n)
Definition: Logger.h:388
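
A brief caller-side sketch (hypothetical, not from the source) showing how the returned CachedCardinality pair is typically unpacked:

// Sketch: CachedCardinality is std::pair<bool, size_t>, where .first signals a
// cache hit and .second carries the cached cardinality (undefined on a miss).
void use_cached_cardinality(Executor& executor, const CardinalityCacheKey& key) {
  const auto cached = executor.getCachedCardinality(key);
  if (cached.first) {
    const size_t cardinality = cached.second;
    // ... reuse the cached estimate instead of re-running the estimator ...
  } else {
    // ... compute the cardinality and insert it into the cache for next time ...
  }
}
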
CgenState* Executor::getCgenStatePtr ( ) const
inline

Definition at line 1414 of file Execute.h.

References cgen_state_.

1414 { return cgen_state_.get(); }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
ExecutorResourceMgr_Namespace::ChunkRequestInfo Executor::getChunkRequestInfo ( const ExecutorDeviceType  device_type,
const std::vector< InputDescriptor > &  input_descs,
const std::vector< InputTableInfo > &  query_infos,
const std::vector< std::pair< int32_t, FragmentsList >> &  device_fragment_lists 
) const

Determines a unique list of chunks and their associated byte sizes for a given query plan.

Called by Executor::launchKernelsViaResourceMgr

Note that this currently requires the kernel fragment lists generated in Executor::createKernels (which calls QueryFragmentDescriptor::buildFragmentKernelMap). It would be nice to hoist that logic out so this method could be called earlier, i.e. before compilation, so that we do not waste compilation cycles attempting to run a query on GPU only to find there are insufficient resources and it must be kicked to CPU.

Note this method currently has two key limitations:

  1. Only accounts for chunks in the lhs table if a join is involved.
  2. Conservatively estimates that column widths for intermediate results are always 8 bytes, when in some cases they may have a lower byte width.
Parameters
device_type - specifies whether the query needs CPU or GPU buffer pool memory
input_descs - tables needed by the query
query_infos
kernel_fragment_lists
Returns
ExecutorResourceMgr_Namespace::ChunkRequestInfo - contains various info used by ExecutorResourceMgr to gate (and soon optimize scheduling of) query step resource requests.

Definition at line 852 of file Execute.cpp.

References gpu_enabled::accumulate(), CPU, and getColumnByteWidthMap().

Referenced by launchKernelsViaResourceMgr().

856  {
857  using TableFragmentId = std::pair<shared::TableKey, int32_t>;
858  using TableFragmentSizeMap = std::map<TableFragmentId, size_t>;
859 
860  /* Calculate bytes per column */
861 
862  // Only fetch lhs table ids for now...
863  // Allows us to cleanly lower number of kernels in flight to save
864  // buffer pool space, but is not a perfect estimate when big rhs
865  // join tables are involved. Will revisit.
866 
867  std::set<shared::TableKey> lhs_table_keys;
868  for (const auto& input_desc : input_descs) {
869  if (input_desc.getNestLevel() == 0) {
870  lhs_table_keys.insert(input_desc.getTableKey());
871  }
872  }
873 
874  const bool include_lazy_fetch_cols = device_type == ExecutorDeviceType::CPU;
875  const auto column_byte_width_map =
876  getColumnByteWidthMap(lhs_table_keys, include_lazy_fetch_cols);
877 
878  /* Calculate the byte width per row (sum of all columns widths)
879  Assumes each fragment touches the same columns, which is a DB-wide
880  invariant for now */
881 
882  size_t const byte_width_per_row =
883  std::accumulate(column_byte_width_map.begin(),
884  column_byte_width_map.end(),
885  size_t(0),
886  [](size_t sum, auto& col_entry) { return sum + col_entry.second; });
887 
888  /* Calculate num tuples for all fragments */
889 
890  TableFragmentSizeMap all_table_fragments_size_map;
891 
892  for (auto& query_info : query_infos) {
893  const auto& table_key = query_info.table_key;
894  for (const auto& frag : query_info.info.fragments) {
895  const int32_t frag_id = frag.fragmentId;
896  const TableFragmentId table_frag_id = std::make_pair(table_key, frag_id);
897  const size_t fragment_num_tuples = frag.getNumTuples(); // num_tuples;
898  all_table_fragments_size_map.insert(
899  std::make_pair(table_frag_id, fragment_num_tuples));
900  }
901  }
902 
903  /* Calculate num tuples only for fragments actually touched by query
904  Also calculate the num bytes needed for each kernel */
905 
906  TableFragmentSizeMap query_table_fragments_size_map;
907  std::vector<size_t> bytes_per_kernel;
908  bytes_per_kernel.reserve(kernel_fragment_lists.size());
909 
910  size_t max_kernel_bytes{0};
911 
912  for (auto& kernel_frag_list : kernel_fragment_lists) {
913  size_t kernel_bytes{0};
914  const auto frag_list = kernel_frag_list.second;
915  for (const auto& table_frags : frag_list) {
916  const auto& table_key = table_frags.table_key;
917  for (const size_t frag_id : table_frags.fragment_ids) {
918  const TableFragmentId table_frag_id = std::make_pair(table_key, frag_id);
919  const size_t fragment_num_tuples = all_table_fragments_size_map[table_frag_id];
920  kernel_bytes += fragment_num_tuples * byte_width_per_row;
921  query_table_fragments_size_map.insert(
922  std::make_pair(table_frag_id, fragment_num_tuples));
923  }
924  }
925  bytes_per_kernel.emplace_back(kernel_bytes);
926  if (kernel_bytes > max_kernel_bytes) {
927  max_kernel_bytes = kernel_bytes;
928  }
929  }
930 
931  /* Calculate bytes per chunk touched by the query */
932 
933  std::map<ChunkKey, size_t> all_chunks_byte_sizes_map;
934  constexpr int32_t subkey_min = std::numeric_limits<int32_t>::min();
935 
936  for (const auto& col_byte_width_entry : column_byte_width_map) {
937  // Build a chunk key prefix of (db_id, table_id, column_id)
938  const int32_t db_id = col_byte_width_entry.first.db_id;
939  const int32_t table_id = col_byte_width_entry.first.table_id;
940  const int32_t col_id = col_byte_width_entry.first.column_id;
941  const size_t col_byte_width = col_byte_width_entry.second;
942  const shared::TableKey table_key(db_id, table_id);
943 
944  const auto frag_start =
945  query_table_fragments_size_map.lower_bound({table_key, subkey_min});
946  for (auto frag_itr = frag_start; frag_itr != query_table_fragments_size_map.end() &&
947  frag_itr->first.first == table_key;
948  frag_itr++) {
949  const ChunkKey chunk_key = {db_id, table_id, col_id, frag_itr->first.second};
950  const size_t chunk_byte_size = col_byte_width * frag_itr->second;
951  all_chunks_byte_sizes_map.insert({chunk_key, chunk_byte_size});
952  }
953  }
954 
955  size_t total_chunk_bytes{0};
956  const size_t num_chunks = all_chunks_byte_sizes_map.size();
957  std::vector<std::pair<ChunkKey, size_t>> chunks_with_byte_sizes;
958  chunks_with_byte_sizes.reserve(num_chunks);
959  for (const auto& chunk_byte_size_entry : all_chunks_byte_sizes_map) {
960  chunks_with_byte_sizes.emplace_back(
961  std::make_pair(chunk_byte_size_entry.first, chunk_byte_size_entry.second));
962  // Add here, post mapping of the chunks, to make sure chunks are deduped and we get an
963  // accurate size estimate
964  total_chunk_bytes += chunk_byte_size_entry.second;
965  }
966  // Don't allow scaling of bytes per kernel launches for GPU yet as we're not set up for
967  // this at this point
968  const bool bytes_scales_per_kernel = device_type == ExecutorDeviceType::CPU;
969 
970  // Return ChunkRequestInfo
971 
972  return {device_type,
973  chunks_with_byte_sizes,
974  num_chunks,
975  total_chunk_bytes,
976  bytes_per_kernel,
977  max_kernel_bytes,
978  bytes_scales_per_kernel};
979 }
std::vector< int > ChunkKey
Definition: types.h:36
DEVICE auto accumulate(ARGS &&...args)
Definition: gpu_enabled.h:42
std::map< shared::ColumnKey, size_t > getColumnByteWidthMap(const std::set< shared::TableKey > &table_ids_to_fetch, const bool include_lazy_fetched_cols) const
Definition: Execute.cpp:794

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
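
To make the accounting in the listing above concrete, a small worked example (illustrative numbers only, not from the source):

// Worked example mirroring the listing above, with made-up sizes.
// Two fetched lhs columns of 4 and 8 bytes:
//   byte_width_per_row = 4 + 8 = 12 bytes
// A kernel covering fragments of 1,000,000 and 250,000 tuples:
//   kernel_bytes = (1,000,000 + 250,000) * 12 = 15,000,000 bytes (~14.3 MiB)
// Each (db_id, table_id, column_id, fragment_id) chunk contributes
//   chunk_byte_size = col_byte_width * fragment_num_tuples
// and total_chunk_bytes sums these over the deduplicated chunk set.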

std::vector< ColumnLazyFetchInfo > Executor::getColLazyFetchInfo ( const std::vector< Analyzer::Expr * > &  target_exprs) const

Definition at line 992 of file Execute.cpp.

References CHECK, get_column_descriptor(), get_column_descriptor_maybe(), IS_GEO, kNULLT, and plan_state_.

Referenced by createKernels().

993  {
994  CHECK(plan_state_);
995  std::vector<ColumnLazyFetchInfo> col_lazy_fetch_info;
996  for (const auto target_expr : target_exprs) {
997  if (!plan_state_->isLazyFetchColumn(target_expr)) {
998  col_lazy_fetch_info.emplace_back(
999  ColumnLazyFetchInfo{false, -1, SQLTypeInfo(kNULLT, false)});
1000  } else {
1001  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1002  CHECK(col_var);
1003  auto rte_idx = (col_var->get_rte_idx() == -1) ? 0 : col_var->get_rte_idx();
1004  const auto cd = get_column_descriptor_maybe(col_var->getColumnKey());
1005  if (cd && IS_GEO(cd->columnType.get_type())) {
1006  // Geo coords cols will be processed in sequence. So we only need to track the
1007  // first coords col in lazy fetch info.
1008  {
1009  auto col_key = col_var->getColumnKey();
1010  col_key.column_id += 1;
1011  const auto cd0 = get_column_descriptor(col_key);
1012  const auto col0_ti = cd0->columnType;
1013  CHECK(!cd0->isVirtualCol);
1014  const auto col0_var = makeExpr<Analyzer::ColumnVar>(col0_ti, col_key, rte_idx);
1015  const auto local_col0_id = plan_state_->getLocalColumnId(col0_var.get(), false);
1016  col_lazy_fetch_info.emplace_back(
1017  ColumnLazyFetchInfo{true, local_col0_id, col0_ti});
1018  }
1019  } else {
1020  auto local_col_id = plan_state_->getLocalColumnId(col_var, false);
1021  const auto& col_ti = col_var->get_type_info();
1022  col_lazy_fetch_info.emplace_back(ColumnLazyFetchInfo{true, local_col_id, col_ti});
1023  }
1024  }
1025  }
1026  return col_lazy_fetch_info;
1027 }
const ColumnDescriptor * get_column_descriptor_maybe(const shared::ColumnKey &column_key)
Definition: Execute.h:241
const ColumnDescriptor * get_column_descriptor(const shared::ColumnKey &column_key)
Definition: Execute.h:213
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define CHECK(condition)
Definition: Logger.h:291
#define IS_GEO(T)
Definition: sqltypes.h:310

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

ExpressionRange Executor::getColRange ( const PhysicalInput phys_input) const

Definition at line 721 of file Execute.cpp.

References agg_col_range_cache_, and AggregatedColRange::getColRange().

721  {
722  return agg_col_range_cache_.getColRange(phys_input);
723 }
ExpressionRange getColRange(const PhysicalInput &) const
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1572

+ Here is the call graph for this function:

std::map< shared::ColumnKey, size_t > Executor::getColumnByteWidthMap ( const std::set< shared::TableKey > &  table_ids_to_fetch,
const bool  include_lazy_fetched_cols 
) const

Definition at line 794 of file Execute.cpp.

References CHECK, anonymous_namespace{Execute.cpp}::get_col_byte_width(), and plan_state_.

Referenced by getChunkRequestInfo().

796  {
797  std::map<shared::ColumnKey, size_t> col_byte_width_map;
798 
799  for (const auto& fetched_col : plan_state_->getColumnsToFetch()) {
800  if (table_ids_to_fetch.count({fetched_col.db_id, fetched_col.table_id}) == 0) {
801  continue;
802  }
803  const size_t col_byte_width = get_col_byte_width(fetched_col);
804  CHECK(col_byte_width_map.insert({fetched_col, col_byte_width}).second);
805  }
806  if (include_lazy_fetched_cols) {
807  for (const auto& lazy_fetched_col : plan_state_->getColumnsToNotFetch()) {
808  if (table_ids_to_fetch.count({lazy_fetched_col.db_id, lazy_fetched_col.table_id}) ==
809  0) {
810  continue;
811  }
812  const size_t col_byte_width = get_col_byte_width(lazy_fetched_col);
813  CHECK(col_byte_width_map.insert({lazy_fetched_col, col_byte_width}).second);
814  }
815  }
816  return col_byte_width_map;
817 }
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
size_t get_col_byte_width(const shared::ColumnKey &column_key)
Definition: Execute.cpp:766
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const ColumnDescriptor * Executor::getColumnDescriptor ( const Analyzer::ColumnVar col_var) const

Definition at line 686 of file Execute.cpp.

References get_column_descriptor_maybe(), and Analyzer::ColumnVar::getColumnKey().

Referenced by getPhysicalColumnDescriptor().

687  {
688  return get_column_descriptor_maybe(col_var->getColumnKey());
689 }
const ColumnDescriptor * get_column_descriptor_maybe(const shared::ColumnKey &column_key)
Definition: Execute.h:241
const shared::ColumnKey & getColumnKey() const
Definition: Analyzer.h:198

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

llvm::LLVMContext& Executor::getContext ( )
inline

Definition at line 1417 of file Execute.h.

References context_.

1417 { return *context_.get(); }
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477
QuerySessionId & Executor::getCurrentQuerySession ( heavyai::shared_lock< heavyai::shared_mutex > &  read_lock)

Definition at line 4952 of file Execute.cpp.

References current_query_session_.

Referenced by executePlanWithGroupBy(), executePlanWithoutGroupBy(), executeWorkUnitImpl(), fetchChunks(), and fetchUnionChunks().

4953  {
4954  return current_query_session_;
4955 }
QuerySessionId current_query_session_
Definition: Execute.h:1576

+ Here is the caller graph for this function:

Data_Namespace::DataMgr* Executor::getDataMgr ( ) const
inline

Definition at line 623 of file Execute.h.

References CHECK, and data_mgr_.

Referenced by getDeviceTypeForTargets(), logSystemCPUMemoryStatus(), and logSystemGPUMemoryStatus().

623  {
624  CHECK(data_mgr_);
625  return data_mgr_;
626  }
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

heavyai::shared_mutex & Executor::getDataRecyclerLock ( )

Definition at line 4936 of file Execute.cpp.

References recycler_mutex_.

4936  {
4937  return recycler_mutex_;
4938 }
static heavyai::shared_mutex recycler_mutex_
Definition: Execute.h:1605
ExecutorDeviceType Executor::getDeviceTypeForTargets ( const RelAlgExecutionUnit ra_exe_unit,
const ExecutorDeviceType  requested_device_type 
)
private

Definition at line 2546 of file Execute.cpp.

References CPU, g_bigint_count, get_target_info(), getDataMgr(), RelAlgExecutionUnit::groupby_exprs, isArchPascalOrLater(), kAVG, kDOUBLE, kSUM, kSUM_IF, and RelAlgExecutionUnit::target_exprs.

Referenced by executeWorkUnitImpl().

2548  {
2549  if (!getDataMgr()->gpusPresent()) {
2550  return ExecutorDeviceType::CPU;
2551  }
2552  for (const auto target_expr : ra_exe_unit.target_exprs) {
2553  const auto agg_info = get_target_info(target_expr, g_bigint_count);
2554  if (!ra_exe_unit.groupby_exprs.empty() &&
2555  !isArchPascalOrLater(requested_device_type)) {
2556  if ((agg_info.agg_kind == kAVG || agg_info.agg_kind == kSUM ||
2557  agg_info.agg_kind == kSUM_IF) &&
2558  agg_info.agg_arg_type.get_type() == kDOUBLE) {
2559  return ExecutorDeviceType::CPU;
2560  }
2561  }
2562  if (dynamic_cast<const Analyzer::RegexpExpr*>(target_expr)) {
2563  return ExecutorDeviceType::CPU;
2564  }
2565  }
2566  return requested_device_type;
2567 }
std::vector< Analyzer::Expr * > target_exprs
bool isArchPascalOrLater(const ExecutorDeviceType dt) const
Definition: Execute.h:872
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:92
bool g_bigint_count
Definition: sqldefs.h:77
Data_Namespace::DataMgr * getDataMgr() const
Definition: Execute.h:623
Definition: sqldefs.h:74

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr< Executor > Executor::getExecutor ( const ExecutorId  id,
const std::string &  debug_dir = "",
const std::string &  debug_file = "",
const SystemParameters system_parameters = SystemParameters() 
)
static

Definition at line 509 of file Execute.cpp.

References CHECK, SystemParameters::cuda_block_size, SystemParameters::cuda_grid_size, executors_, executors_cache_mutex_, Catalog_Namespace::SysCatalog::getDataMgr(), Catalog_Namespace::SysCatalog::instance(), and SystemParameters::max_gpu_slab_size.

Referenced by ResultSetReductionJIT::codegen(), GpuReductionHelperJIT::codegen(), ColumnarResults::ColumnarResults(), Parser::OptimizeTableStmt::execute(), Parser::CopyTableStmt::execute(), Parser::InsertValuesStmt::execute(), DBHandler::execute_rel_alg(), QueryRunner::QueryRunner::extractQueryPlanDag(), StubGenerator::generateStub(), DBHandler::get_queries_info(), QueryRunner::QueryRunner::getCalcitePlan(), Fragmenter_Namespace::FragmentInfo::getChunkMetadataMap(), QueryRunner::QueryRunner::getExecutor(), CgenState::getExecutor(), Parser::LocalQueryConnector::getOuterFragmentCount(), QueryRunner::QueryRunner::getParsedGlobalQueryHints(), QueryRunner::QueryRunner::getParsedQueryHint(), QueryRunner::QueryRunner::getParsedQueryHints(), DBHandler::getQueries(), QueryRunner::QueryRunner::getQueryInfoForDataRecyclerTest(), QueryRunner::QueryRunner::getRaExecutionSequence(), QueryRunner::QueryRunner::getRelAlgDag(), QueryRunner::QueryRunner::getRootNodeFromParsedQuery(), DBHandler::import_table(), import_export::Importer::importDelimited(), import_export::Importer::importGDALGeo(), import_export::Importer::importGDALRaster(), DBHandler::importGeoTableSingle(), DBHandler::interrupt(), DBHandler::interruptQuery(), DBHandler::invalidate_cur_session(), anonymous_namespace{DBHandler.cpp}::log_cache_size(), migrations::MigrationMgr::migrateDateInDaysMetadata(), Parser::InsertIntoTableAsSelectStmt::populateData(), Parser::LocalQueryConnector::query(), ResultSetStorage::reduceEntriesNoCollisionsColWise(), QueryRunner::anonymous_namespace{QueryRunner.cpp}::run_select_query_with_filter_push_down(), QueryRunner::QueryRunner::runSelectQuery(), QueryRunner::QueryRunner::runSQLWithAllowingInterrupt(), DBHandler::set_cur_session(), DBHandler::sql_execute_impl(), and anonymous_namespace{DdlCommandExecutor.cpp}::vacuum_table_if_required().

513  {
514  heavyai::unique_lock<heavyai::shared_mutex> write_lock(executors_cache_mutex_);
515  auto it = executors_.find(executor_id);
516  if (it != executors_.end()) {
517  return it->second;
518  }
519  auto& data_mgr = Catalog_Namespace::SysCatalog::instance().getDataMgr();
520  auto executor = std::make_shared<Executor>(executor_id,
521  &data_mgr,
522  system_parameters.cuda_block_size,
523  system_parameters.cuda_grid_size,
524  system_parameters.max_gpu_slab_size,
525  debug_dir,
526  debug_file);
527  CHECK(executors_.insert(std::make_pair(executor_id, executor)).second);
528  return executor;
529 }
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:234
static SysCatalog & instance()
Definition: SysCatalog.h:343
std::unique_lock< T > unique_lock
static std::map< int, std::shared_ptr< Executor > > executors_
Definition: Execute.h:1581
#define CHECK(condition)
Definition: Logger.h:291
static heavyai::shared_mutex executors_cache_mutex_
Definition: Execute.h:1602

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
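
A minimal usage sketch for the static factory above; the parameter values are illustrative, and the UNITARY_EXECUTOR_ID constant is assumed:

// Sketch: obtain (or lazily construct) a cached executor. Values are
// illustrative only; UNITARY_EXECUTOR_ID is assumed for the executor id.
SystemParameters system_parameters;
system_parameters.cuda_block_size = 1024;
system_parameters.cuda_grid_size = 2;
system_parameters.max_gpu_slab_size = 1UL << 30;
auto executor = Executor::getExecutor(Executor::UNITARY_EXECUTOR_ID,
                                      /*debug_dir=*/"",
                                      /*debug_file=*/"",
                                      system_parameters);
// A subsequent call with the same id returns the same shared_ptr from executors_.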

ExecutorId Executor::getExecutorId ( ) const
inline

Definition at line 1332 of file Execute.h.

References executor_id_.

Referenced by launchKernelsViaResourceMgr().

1332 { return executor_id_; };
const ExecutorId executor_id_
Definition: Execute.h:1476

+ Here is the caller graph for this function:

const std::vector< size_t > Executor::getExecutorIdsRunningQuery ( const QuerySessionId interrupt_session) const

Definition at line 5313 of file Execute.cpp.

References executor_session_mutex_, queries_session_map_, and run_benchmark_import::res.

5314  {
5315  std::vector<size_t> res;
5316  heavyai::shared_lock<heavyai::shared_mutex> read_lock(executor_session_mutex_);
5317  auto it = queries_session_map_.find(interrupt_session);
5318  if (it != queries_session_map_.end()) {
5319  for (auto& kv : it->second) {
5320  if (kv.second.getQueryStatus() ==
5321  QuerySessionStatus::QueryStatus::RUNNING_QUERY_KERNEL) {
5322  res.push_back(kv.second.getExecutorId());
5323  }
5324  }
5325  }
5326  return res;
5327 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
std::shared_lock< T > shared_lock
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
const SQLTypeInfo Executor::getFirstOrderColTypeInfo ( WindowFunctionContext window_func_context) const
private

Definition at line 722 of file WindowFunctionIR.cpp.

References Analyzer::WindowFunction::getOrderKeys(), and WindowFunctionContext::getWindowFunction().

723  {
724  const auto window_func = window_func_context->getWindowFunction();
725  return window_func->getOrderKeys().front()->get_type_info();
726 }
const std::vector< std::shared_ptr< Analyzer::Expr > > & getOrderKeys() const
Definition: Analyzer.h:2802
const Analyzer::WindowFunction * getWindowFunction() const

+ Here is the call graph for this function:

std::vector< size_t > Executor::getFragmentCount ( const FragmentsList selected_fragments,
const size_t  scan_idx,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 3728 of file Execute.cpp.

References RelAlgExecutionUnit::input_descs, RelAlgExecutionUnit::join_quals, and plan_state_.

Referenced by buildSelectedFragsMapping().

3730  {
3731  if ((ra_exe_unit.input_descs.size() > size_t(2) || !ra_exe_unit.join_quals.empty()) &&
3732  scan_idx > 0 &&
3733  !plan_state_->join_info_.sharded_range_table_indices_.count(scan_idx) &&
3734  !selected_fragments[scan_idx].fragment_ids.empty()) {
3735  // Fetch all fragments
3736  return {size_t(0)};
3737  }
3738 
3739  return selected_fragments[scan_idx].fragment_ids;
3740 }
std::vector< InputDescriptor > input_descs
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532

+ Here is the caller graph for this function:

std::string Executor::getFramingFuncName ( const std::string &  bound_type,
const std::string &  order_col_type,
const std::string &  op_type,
bool  for_timestamp_type 
) const
private

Definition at line 837 of file WindowFunctionIR.cpp.

840  {
841  auto target_val_type = for_timestamp_type ? "int64_t" : order_col_type;
842  auto null_type = for_timestamp_type ? "int64_t" : order_col_type;
843  return "range_mode_" + target_val_type + "_" + order_col_type + "_" + null_type + "_" +
844  op_type + "_frame_" + bound_type + "_bound";
845 }
std::unordered_map< shared::TableKey, const Analyzer::BinOper * > Executor::getInnerTabIdToJoinCond ( ) const
private

Definition at line 2849 of file Execute.cpp.

References CHECK_EQ, and plan_state_.

2849  {
2850  std::unordered_map<shared::TableKey, const Analyzer::BinOper*> id_to_cond;
2851  const auto& join_info = plan_state_->join_info_;
2852  CHECK_EQ(join_info.equi_join_tautologies_.size(), join_info.join_hash_tables_.size());
2853  for (size_t i = 0; i < join_info.join_hash_tables_.size(); ++i) {
2854  const auto& inner_table_key = join_info.join_hash_tables_[i]->getInnerTableId();
2855  id_to_cond.insert(
2856  std::make_pair(inner_table_key, join_info.equi_join_tautologies_[i].get()));
2857  }
2858  return id_to_cond;
2859 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
std::vector< int8_t * > Executor::getJoinHashTablePtrs ( const ExecutorDeviceType  device_type,
const int  device_id 
)
private

Definition at line 4219 of file Execute.cpp.

References CHECK, GPU, and plan_state_.

Referenced by executePlanWithGroupBy(), and executePlanWithoutGroupBy().

4220  {
4221  std::vector<int8_t*> table_ptrs;
4222  const auto& join_hash_tables = plan_state_->join_info_.join_hash_tables_;
4223  for (auto hash_table : join_hash_tables) {
4224  if (!hash_table) {
4225  CHECK(table_ptrs.empty());
4226  return {};
4227  }
4228  table_ptrs.push_back(hash_table->getJoinHashBuffer(
4229  device_type, device_type == ExecutorDeviceType::GPU ? device_id : 0));
4230  }
4231  return table_ptrs;
4232 }
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

const StringDictionaryProxy::IdMap * Executor::getJoinIntersectionStringProxyTranslationMap ( const StringDictionaryProxy source_proxy,
StringDictionaryProxy dest_proxy,
const std::vector< StringOps_Namespace::StringOpInfo > &  source_string_op_infos,
const std::vector< StringOps_Namespace::StringOpInfo > &  dest_source_string_op_infos,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner 
) const

Definition at line 617 of file Execute.cpp.

References CHECK, and str_dict_mutex_.

622  {
623  CHECK(row_set_mem_owner);
624  std::lock_guard<std::mutex> lock(
625  str_dict_mutex_); // TODO: can we use RowSetMemOwner state mutex here?
626  // First translate lhs onto itself if there are string ops
627  if (!dest_string_op_infos.empty()) {
628  row_set_mem_owner->addStringProxyUnionTranslationMap(
629  dest_proxy, dest_proxy, dest_string_op_infos);
630  }
631  return row_set_mem_owner->addStringProxyIntersectionTranslationMap(
632  source_proxy, dest_proxy, source_string_op_infos);
633 }
std::mutex str_dict_mutex_
Definition: Execute.h:1545
#define CHECK(condition)
Definition: Logger.h:291
const QueryPlanDAG Executor::getLatestQueryPlanDagExtracted ( ) const

Definition at line 5348 of file Execute.cpp.

References latest_query_plan_extracted_, and recycler_mutex_.

5348  {
5349  heavyai::shared_lock<heavyai::shared_mutex> lock(recycler_mutex_);
5350  return latest_query_plan_extracted_;
5351 }
std::shared_lock< T > shared_lock
static QueryPlanDAG latest_query_plan_extracted_
Definition: Execute.h:1612
static heavyai::shared_mutex recycler_mutex_
Definition: Execute.h:1605
size_t Executor::getNumBytesForFetchedRow ( const std::set< shared::TableKey > &  table_keys_to_fetch) const
size_t Executor::getNumBytesForFetchedRow ( const std::set< int > &  table_ids_to_fetch) const
size_t Executor::getNumCurentSessionsEnrolled ( ) const

Definition at line 5081 of file Execute.cpp.

References executor_session_mutex_, and queries_session_map_.

5081  {
5082  heavyai::shared_lock<heavyai::shared_mutex> session_lock(executor_session_mutex_);
5083  return queries_session_map_.size();
5084 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
std::shared_lock< T > shared_lock
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
size_t Executor::getOrderKeySize ( WindowFunctionContext window_func_context) const
private

Definition at line 728 of file WindowFunctionIR.cpp.

728  {
729  const auto order_key_size = getFirstOrderColTypeInfo(window_func_context).get_size();
730  return order_key_size;
731 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
const SQLTypeInfo getFirstOrderColTypeInfo(WindowFunctionContext *window_func_context) const
const std::string Executor::getOrderKeyTypeName ( WindowFunctionContext window_func_context) const
private

Definition at line 733 of file WindowFunctionIR.cpp.

References CHECK, anonymous_namespace{WindowFunctionIR.cpp}::get_col_type_name_by_size(), Analyzer::WindowFunction::getOrderKeys(), and WindowFunctionContext::getWindowFunction().

734  {
735  auto const order_key_size = getOrderKeySize(window_func_context);
736  auto const order_key_ptr =
737  window_func_context->getWindowFunction()->getOrderKeys().front();
738  CHECK(order_key_ptr);
739  return get_col_type_name_by_size(order_key_size,
740  order_key_ptr->get_type_info().is_fp());
741 }
std::string get_col_type_name_by_size(const size_t size, const bool is_fp)
const std::vector< std::shared_ptr< Analyzer::Expr > > & getOrderKeys() const
Definition: Analyzer.h:2802
#define CHECK(condition)
Definition: Logger.h:291
const Analyzer::WindowFunction * getWindowFunction() const
size_t getOrderKeySize(WindowFunctionContext *window_func_context) const

+ Here is the call graph for this function:

const ColumnDescriptor * Executor::getPhysicalColumnDescriptor ( const Analyzer::ColumnVar col_var,
int  n 
) const

Definition at line 691 of file Execute.cpp.

References shared::ColumnKey::column_id, get_column_descriptor_maybe(), getColumnDescriptor(), Analyzer::ColumnVar::getColumnKey(), and anonymous_namespace{Utm.h}::n.

693  {
694  const auto cd = getColumnDescriptor(col_var);
695  if (!cd || n > cd->columnType.get_physical_cols()) {
696  return nullptr;
697  }
698  auto column_key = col_var->getColumnKey();
699  column_key.column_id += n;
700  return get_column_descriptor_maybe(column_key);
701 }
const ColumnDescriptor * get_column_descriptor_maybe(const shared::ColumnKey &column_key)
Definition: Execute.h:241
const ColumnDescriptor * getColumnDescriptor(const Analyzer::ColumnVar *) const
Definition: Execute.cpp:686
const shared::ColumnKey & getColumnKey() const
Definition: Analyzer.h:198
constexpr double n
Definition: Utm.h:38

+ Here is the call graph for this function:

PlanState* Executor::getPlanStatePtr ( ) const
inline

Definition at line 1415 of file Execute.h.

References plan_state_.

1415 { return plan_state_.get(); }
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
QueryPlanDagCache & Executor::getQueryPlanDagCache ( )

Definition at line 4940 of file Execute.cpp.

References query_plan_dag_cache_.

4940  {
4941  return query_plan_dag_cache_;
4942 }
static QueryPlanDagCache query_plan_dag_cache_
Definition: Execute.h:1604
std::vector< QuerySessionStatus > Executor::getQuerySessionInfo ( const QuerySessionId query_session,
heavyai::shared_lock< heavyai::shared_mutex > &  read_lock 
)

Definition at line 5295 of file Execute.cpp.

References queries_session_map_.

5297  {
5298  if (!queries_session_map_.empty() && queries_session_map_.count(query_session)) {
5299  auto& query_infos = queries_session_map_.at(query_session);
5300  std::vector<QuerySessionStatus> ret;
5301  for (auto& info : query_infos) {
5302  ret.emplace_back(query_session,
5303  info.second.getExecutorId(),
5304  info.second.getQueryStr(),
5305  info.second.getQuerySubmittedTime(),
5306  info.second.getQueryStatus());
5307  }
5308  return ret;
5309  }
5310  return {};
5311 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
QuerySessionStatus::QueryStatus Executor::getQuerySessionStatus ( const QuerySessionId candidate_query_session,
heavyai::shared_lock< heavyai::shared_mutex > &  read_lock 
)

Definition at line 4967 of file Execute.cpp.

References queries_session_map_.

4969  {
4970  if (queries_session_map_.count(candidate_query_session) &&
4971  !queries_session_map_.at(candidate_query_session).empty()) {
4972  return queries_session_map_.at(candidate_query_session)
4973  .begin()
4974  ->second.getQueryStatus();
4975  }
4976  return QuerySessionStatus::QueryStatus::UNDEFINED;
4977 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
ResultSetRecyclerHolder & Executor::getResultSetRecyclerHolder ( )

Definition at line 4944 of file Execute.cpp.

References resultset_recycler_holder_.

4944  {
4945  return resultset_recycler_holder_;
4946 }
static ResultSetRecyclerHolder resultset_recycler_holder_
Definition: Execute.h:1608
std::pair< std::vector< std::vector< int64_t > >, std::vector< std::vector< uint64_t > > > Executor::getRowCountAndOffsetForAllFrags ( const RelAlgExecutionUnit ra_exe_unit,
const CartesianProduct< std::vector< std::vector< size_t >>> &  frag_ids_crossjoin,
const std::vector< InputDescriptor > &  input_descs,
const std::map< shared::TableKey, const TableFragments * > &  all_tables_fragments 
)
private

Definition at line 3335 of file Execute.cpp.

References CHECK, CHECK_EQ, CHECK_LT, get_table_id_to_frag_offsets(), RelAlgExecutionUnit::join_quals, plan_state_, and RelAlgExecutionUnit::union_all.

Referenced by fetchChunks(), and fetchUnionChunks().

3339  {
3340  std::vector<std::vector<int64_t>> all_num_rows;
3341  std::vector<std::vector<uint64_t>> all_frag_offsets;
3342  const auto tab_id_to_frag_offsets =
3343  get_table_id_to_frag_offsets(input_descs, all_tables_fragments);
3344  std::unordered_map<size_t, size_t> outer_id_to_num_row_idx;
3345  for (const auto& selected_frag_ids : frag_ids_crossjoin) {
3346  std::vector<int64_t> num_rows;
3347  std::vector<uint64_t> frag_offsets;
3348  if (!ra_exe_unit.union_all) {
3349  CHECK_EQ(selected_frag_ids.size(), input_descs.size());
3350  }
3351  for (size_t tab_idx = 0; tab_idx < input_descs.size(); ++tab_idx) {
3352  const auto frag_id = ra_exe_unit.union_all ? 0 : selected_frag_ids[tab_idx];
3353  const auto fragments_it =
3354  all_tables_fragments.find(input_descs[tab_idx].getTableKey());
3355  CHECK(fragments_it != all_tables_fragments.end());
3356  const auto& fragments = *fragments_it->second;
3357  if (ra_exe_unit.join_quals.empty() || tab_idx == 0 ||
3358  plan_state_->join_info_.sharded_range_table_indices_.count(tab_idx)) {
3359  const auto& fragment = fragments[frag_id];
3360  num_rows.push_back(fragment.getNumTuples());
3361  } else {
3362  size_t total_row_count{0};
3363  for (const auto& fragment : fragments) {
3364  total_row_count += fragment.getNumTuples();
3365  }
3366  num_rows.push_back(total_row_count);
3367  }
3368  const auto frag_offsets_it =
3369  tab_id_to_frag_offsets.find(input_descs[tab_idx].getTableKey());
3370  CHECK(frag_offsets_it != tab_id_to_frag_offsets.end());
3371  const auto& offsets = frag_offsets_it->second;
3372  CHECK_LT(frag_id, offsets.size());
3373  frag_offsets.push_back(offsets[frag_id]);
3374  }
3375  all_num_rows.push_back(num_rows);
3376  // Fragment offsets of outer table should be ONLY used by rowid for now.
3377  all_frag_offsets.push_back(frag_offsets);
3378  }
3379  return {all_num_rows, all_frag_offsets};
3380 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::map< shared::TableKey, std::vector< uint64_t > > get_table_id_to_frag_offsets(const std::vector< InputDescriptor > &input_descs, const std::map< shared::TableKey, const TableFragments * > &all_tables_fragments)
Definition: Execute.cpp:3316
const std::optional< bool > union_all
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define CHECK_LT(x, y)
Definition: Logger.h:303
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const std::shared_ptr< RowSetMemoryOwner > Executor::getRowSetMemoryOwner ( ) const

Definition at line 703 of file Execute.cpp.

References row_set_mem_owner_.

Referenced by executeTableFunction(), TransientStringLiteralsVisitor::visitStringOper(), and TransientStringLiteralsVisitor::visitUOper().

703  {
704  return row_set_mem_owner_;
705 }
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: Execute.h:1533

+ Here is the caller graph for this function:

heavyai::shared_mutex & Executor::getSessionLock ( )

Definition at line 4948 of file Execute.cpp.

References executor_session_mutex_.

4948  {
4949  return executor_session_mutex_;
4950 }
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
StringDictionaryProxy* Executor::getStringDictionaryProxy ( const shared::StringDictKey dict_key,
const bool  with_generation 
) const
inline

Returns a string dictionary proxy using the currently active row set memory owner.

Definition at line 578 of file Execute.h.

References CHECK, and row_set_mem_owner_.

Referenced by addTransientStringLiterals(), and serializeLiterals().

579  {
580  CHECK(row_set_mem_owner_);
581  return getStringDictionaryProxy(dict_key, row_set_mem_owner_, with_generation);
582  }
StringDictionaryProxy * getStringDictionaryProxy(const shared::StringDictKey &dict_key, const bool with_generation) const
Definition: Execute.h:578
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: Execute.h:1533
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:
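
A short caller-side sketch (hypothetical key values, not from the source):

// Sketch: fetch a proxy for a dictionary-encoded column using the executor's
// active RowSetMemoryOwner. The StringDictKey values are illustrative only.
const shared::StringDictKey dict_key{/*db_id=*/1, /*dict_id=*/42};
StringDictionaryProxy* proxy =
    executor->getStringDictionaryProxy(dict_key, /*with_generation=*/true);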

StringDictionaryProxy* Executor::getStringDictionaryProxy ( const shared::StringDictKey dict_key,
const std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const bool  with_generation 
) const
const StringDictionaryProxy::TranslationMap< Datum > * Executor::getStringProxyNumericTranslationMap ( const shared::StringDictKey source_dict_key,
const std::vector< StringOps_Namespace::StringOpInfo > &  string_op_infos,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const bool  with_generation 
) const

Definition at line 636 of file Execute.cpp.

References CHECK, and str_dict_mutex_.

640  {
641  CHECK(row_set_mem_owner);
642  std::lock_guard<std::mutex> lock(
643  str_dict_mutex_); // TODO: can we use RowSetMemOwner state mutex here?
644  return row_set_mem_owner->getOrAddStringProxyNumericTranslationMap(
645  source_dict_key, with_generation, string_op_infos);
646 }
std::mutex str_dict_mutex_
Definition: Execute.h:1545
#define CHECK(condition)
Definition: Logger.h:291
const StringDictionaryProxy::IdMap * Executor::getStringProxyTranslationMap ( const shared::StringDictKey source_dict_key,
const shared::StringDictKey dest_dict_key,
const RowSetMemoryOwner::StringTranslationType  translation_type,
const std::vector< StringOps_Namespace::StringOpInfo > &  string_op_infos,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const bool  with_generation 
) const

Definition at line 602 of file Execute.cpp.

References CHECK, and str_dict_mutex_.

Referenced by TransientStringLiteralsVisitor::visitStringOper(), and TransientStringLiteralsVisitor::visitUOper().

608  {
609  CHECK(row_set_mem_owner);
610  std::lock_guard<std::mutex> lock(
611  str_dict_mutex_); // TODO: can we use RowSetMemOwner state mutex here?
612  return row_set_mem_owner->getOrAddStringProxyTranslationMap(
613  source_dict_key, dest_dict_key, with_generation, translation_type, string_op_infos);
614 }
std::mutex str_dict_mutex_
Definition: Execute.h:1545
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

std::vector< size_t > Executor::getTableFragmentIndices ( const RelAlgExecutionUnit ra_exe_unit,
const ExecutorDeviceType  device_type,
const size_t  table_idx,
const size_t  outer_frag_idx,
std::map< shared::TableKey, const TableFragments * > &  selected_tables_fragments,
const std::unordered_map< shared::TableKey, const Analyzer::BinOper * > &  inner_table_id_to_join_condition 
)
private

Definition at line 3204 of file Execute.cpp.

References CHECK, CHECK_LT, RelAlgExecutionUnit::input_descs, and skipFragmentPair().

3211  {
3212  const auto& table_key = ra_exe_unit.input_descs[table_idx].getTableKey();
3213  auto table_frags_it = selected_tables_fragments.find(table_key);
3214  CHECK(table_frags_it != selected_tables_fragments.end());
3215  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
3216  const auto outer_table_fragments_it =
3217  selected_tables_fragments.find(outer_input_desc.getTableKey());
3218  const auto outer_table_fragments = outer_table_fragments_it->second;
3219  CHECK(outer_table_fragments_it != selected_tables_fragments.end());
3220  CHECK_LT(outer_frag_idx, outer_table_fragments->size());
3221  if (!table_idx) {
3222  return {outer_frag_idx};
3223  }
3224  const auto& outer_fragment_info = (*outer_table_fragments)[outer_frag_idx];
3225  auto& inner_frags = table_frags_it->second;
3226  CHECK_LT(size_t(1), ra_exe_unit.input_descs.size());
3227  std::vector<size_t> all_frag_ids;
3228  for (size_t inner_frag_idx = 0; inner_frag_idx < inner_frags->size();
3229  ++inner_frag_idx) {
3230  const auto& inner_frag_info = (*inner_frags)[inner_frag_idx];
3231  if (skipFragmentPair(outer_fragment_info,
3232  inner_frag_info,
3233  table_idx,
3234  inner_table_id_to_join_condition,
3235  ra_exe_unit,
3236  device_type)) {
3237  continue;
3238  }
3239  all_frag_ids.push_back(inner_frag_idx);
3240  }
3241  return all_frag_ids;
3242 }
std::vector< InputDescriptor > input_descs
#define CHECK_LT(x, y)
Definition: Logger.h:303
#define CHECK(condition)
Definition: Logger.h:291
bool skipFragmentPair(const Fragmenter_Namespace::FragmentInfo &outer_fragment_info, const Fragmenter_Namespace::FragmentInfo &inner_fragment_info, const int inner_table_id, const std::unordered_map< shared::TableKey, const Analyzer::BinOper * > &inner_table_id_to_join_condition, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
Definition: Execute.cpp:3246

+ Here is the call graph for this function:

const TableGeneration & Executor::getTableGeneration ( const shared::TableKey table_key) const

Definition at line 716 of file Execute.cpp.

References TableGenerations::getGeneration(), and table_generations_.

Referenced by skipFragment().

717  {
718  return table_generations_.getGeneration(table_key);
719 }
const TableGeneration & getGeneration(const shared::TableKey &table_key) const
TableGenerations table_generations_
Definition: Execute.h:1573

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

Fragmenter_Namespace::TableInfo Executor::getTableInfo ( const shared::TableKey table_key) const

Definition at line 711 of file Execute.cpp.

References InputTableInfoCache::getTableInfo(), and input_table_info_cache_.

Referenced by computeColRangesCache(), and computeTableGenerations().

712  {
713  return input_table_info_cache_.getTableInfo(table_key);
714 }
InputTableInfoCache input_table_info_cache_
Definition: Execute.h:1571
Fragmenter_Namespace::TableInfo getTableInfo(const shared::TableKey &table_key)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const TemporaryTables* Executor::getTemporaryTables ( )
inline

Returns a pointer to the intermediate tables vector currently stored by this executor.

Definition at line 573 of file Execute.h.

References temporary_tables_.

Referenced by skipFragmentPair().

573 { return temporary_tables_; }
const TemporaryTables * temporary_tables_
Definition: Execute.h:1559

+ Here is the caller graph for this function:

const TemporaryTables* Executor::getTemporaryTables ( ) const
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > Executor::getUniqueThreadSharedResultSets ( const std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &  results_per_device) const
private

Definition at line 1599 of file Execute.cpp.

References gpu_enabled::accumulate(), and run_benchmark_import::result.

Referenced by reduceMultiDeviceResults().

1601  {
1602  std::vector<std::pair<ResultSetPtr, std::vector<size_t>>> unique_thread_results;
1603  if (results_per_device.empty()) {
1604  return unique_thread_results;
1605  }
1606  auto max_ti = [](int acc, auto& e) { return std::max(acc, e.first->getThreadIdx()); };
1607  int const max_thread_idx =
1608  std::accumulate(results_per_device.begin(), results_per_device.end(), -1, max_ti);
1609  std::vector<bool> seen_thread_idxs(max_thread_idx + 1, false);
1610  for (const auto& result : results_per_device) {
1611  const int32_t result_thread_idx = result.first->getThreadIdx();
1612  if (!seen_thread_idxs[result_thread_idx]) {
1613  seen_thread_idxs[result_thread_idx] = true;
1614  unique_thread_results.emplace_back(result);
1615  }
1616  }
1617  return unique_thread_results;
1618 }
DEVICE auto accumulate(ARGS &&...args)
Definition: gpu_enabled.h:42

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

unsigned Executor::gridSize ( ) const

Definition at line 4318 of file Execute.cpp.

References CHECK, data_mgr_, Data_Namespace::DataMgr::getCudaMgr(), and grid_size_x_.

Referenced by collectAllDeviceShardedTopResults(), executePlanWithGroupBy(), executePlanWithoutGroupBy(), executeTableFunction(), executeWorkUnitImpl(), reduceMultiDeviceResults(), reduceMultiDeviceResultSets(), and resultsUnion().

4318  {
4319  CHECK(data_mgr_);
4320  const auto cuda_mgr = data_mgr_->getCudaMgr();
4321  if (!cuda_mgr) {
4322  return 0;
4323  }
4324  return grid_size_x_ ? grid_size_x_ : 2 * cuda_mgr->getMinNumMPsForAllDevices();
4325 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:235
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
unsigned grid_size_x_
Definition: Execute.h:1553
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

Executor::GroupColLLVMValue Executor::groupByColumnCodegen ( Analyzer::Expr group_by_col,
const size_t  col_width,
const CompilationOptions co,
const bool  translate_null_val,
const int64_t  translated_null_val,
DiamondCodegen diamond_codegen,
std::stack< llvm::BasicBlock * > &  array_loops,
const bool  thread_mem_shared 
)
private

Definition at line 1383 of file IRCodegen.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_GE, CodeGenerator::codegen(), CompilationOptions::device_type, get_int_type(), Analyzer::Expr::get_type_info(), kDOUBLE, kUNNEST, log2_bytes(), need_patch_unnest_double(), numeric_type_name(), DiamondCodegen::orig_cond_false_, CodeGenerator::posArg(), and DiamondCodegen::setFalseTarget().

1391  {
1392  AUTOMATIC_IR_METADATA(cgen_state_.get());
1393  CHECK_GE(col_width, sizeof(int32_t));
1394  CodeGenerator code_generator(this);
1395  auto group_key = code_generator.codegen(group_by_col, true, co).front();
1396  auto key_to_cache = group_key;
1397  if (dynamic_cast<Analyzer::UOper*>(group_by_col) &&
1398  static_cast<Analyzer::UOper*>(group_by_col)->get_optype() == kUNNEST) {
1399  auto preheader = cgen_state_->ir_builder_.GetInsertBlock();
1400  auto array_loop_head = llvm::BasicBlock::Create(cgen_state_->context_,
1401  "array_loop_head",
1402  cgen_state_->current_func_,
1403  preheader->getNextNode());
1404  diamond_codegen.setFalseTarget(array_loop_head);
1405  const auto ret_ty = get_int_type(32, cgen_state_->context_);
1406  auto array_idx_ptr = cgen_state_->ir_builder_.CreateAlloca(ret_ty);
1407  CHECK(array_idx_ptr);
1408  cgen_state_->ir_builder_.CreateStore(cgen_state_->llInt(int32_t(0)), array_idx_ptr);
1409  const auto arr_expr = static_cast<Analyzer::UOper*>(group_by_col)->get_operand();
1410  const auto& array_ti = arr_expr->get_type_info();
1411  CHECK(array_ti.is_array());
1412  const auto& elem_ti = array_ti.get_elem_type();
1413  auto array_len =
1414  (array_ti.get_size() > 0)
1415  ? cgen_state_->llInt(array_ti.get_size() / elem_ti.get_size())
1416  : cgen_state_->emitExternalCall(
1417  "array_size",
1418  ret_ty,
1419  {group_key,
1420  code_generator.posArg(arr_expr),
1421  cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))});
1422  cgen_state_->ir_builder_.CreateBr(array_loop_head);
1423  cgen_state_->ir_builder_.SetInsertPoint(array_loop_head);
1424  CHECK(array_len);
1425  auto array_idx = cgen_state_->ir_builder_.CreateLoad(
1426  array_idx_ptr->getType()->getPointerElementType(), array_idx_ptr);
1427  auto bound_check = cgen_state_->ir_builder_.CreateICmp(
1428  llvm::ICmpInst::ICMP_SLT, array_idx, array_len);
1429  auto array_loop_body = llvm::BasicBlock::Create(
1430  cgen_state_->context_, "array_loop_body", cgen_state_->current_func_);
1431  cgen_state_->ir_builder_.CreateCondBr(
1432  bound_check,
1433  array_loop_body,
1434  array_loops.empty() ? diamond_codegen.orig_cond_false_ : array_loops.top());
1435  cgen_state_->ir_builder_.SetInsertPoint(array_loop_body);
1436  cgen_state_->ir_builder_.CreateStore(
1437  cgen_state_->ir_builder_.CreateAdd(array_idx, cgen_state_->llInt(int32_t(1))),
1438  array_idx_ptr);
1439  auto array_at_fname = "array_at_" + numeric_type_name(elem_ti);
1440  if (array_ti.get_size() < 0) {
1441  if (array_ti.get_notnull()) {
1442  array_at_fname = "notnull_" + array_at_fname;
1443  }
1444  array_at_fname = "varlen_" + array_at_fname;
1445  }
1446  const auto ar_ret_ty =
1447  elem_ti.is_fp()
1448  ? (elem_ti.get_type() == kDOUBLE
1449  ? llvm::Type::getDoubleTy(cgen_state_->context_)
1450  : llvm::Type::getFloatTy(cgen_state_->context_))
1451  : get_int_type(elem_ti.get_logical_size() * 8, cgen_state_->context_);
1452  group_key = cgen_state_->emitExternalCall(
1453  array_at_fname,
1454  ar_ret_ty,
1455  {group_key, code_generator.posArg(arr_expr), array_idx});
1456  if (need_patch_unnest_double(
1457  elem_ti, isArchMaxwell(co.device_type), thread_mem_shared)) {
1458  key_to_cache = spillDoubleElement(group_key, ar_ret_ty);
1459  } else {
1460  key_to_cache = group_key;
1461  }
1462  CHECK(array_loop_head);
1463  array_loops.push(array_loop_head);
1464  }
1465  cgen_state_->group_by_expr_cache_.push_back(key_to_cache);
1466  llvm::Value* orig_group_key{nullptr};
1467  if (translate_null_val) {
1468  const std::string translator_func_name(
1469  col_width == sizeof(int32_t) ? "translate_null_key_i32_" : "translate_null_key_");
1470  const auto& ti = group_by_col->get_type_info();
1471  const auto key_type = get_int_type(ti.get_logical_size() * 8, cgen_state_->context_);
1472  orig_group_key = group_key;
1473  group_key = cgen_state_->emitCall(
1474  translator_func_name + numeric_type_name(ti),
1475  {group_key,
1476  static_cast<llvm::Value*>(
1477  llvm::ConstantInt::get(key_type, inline_int_null_val(ti))),
1478  static_cast<llvm::Value*>(llvm::ConstantInt::get(
1479  llvm::Type::getInt64Ty(cgen_state_->context_), translated_null_val))});
1480  }
1481  group_key = cgen_state_->ir_builder_.CreateBitCast(
1482  cgen_state_->castToTypeIn(group_key, col_width * 8),
1483  get_int_type(col_width * 8, cgen_state_->context_));
1484  if (orig_group_key) {
1485  orig_group_key = cgen_state_->ir_builder_.CreateBitCast(
1486  cgen_state_->castToTypeIn(orig_group_key, col_width * 8),
1487  get_int_type(col_width * 8, cgen_state_->context_));
1488  }
1489  return {group_key, orig_group_key};
1490 }
#define CHECK_GE(x, y)
Definition: Logger.h:306
bool need_patch_unnest_double(const SQLTypeInfo &ti, const bool is_maxwell, const bool mem_shared)
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
void setFalseTarget(llvm::BasicBlock *cond_false)
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
llvm::BasicBlock * orig_cond_false_
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:79
ExecutorDeviceType device_type
llvm::Value * spillDoubleElement(llvm::Value *elem_val, llvm::Type *elem_ty)
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477
#define CHECK(condition)
Definition: Logger.h:291
int64_t inline_int_null_val(const SQL_TYPE_INFO &ti)
uint32_t log2_bytes(const uint32_t bytes)
Definition: Execute.h:198
std::string numeric_type_name(const SQLTypeInfo &ti)
Definition: Execute.h:230
bool isArchMaxwell(const ExecutorDeviceType dt) const

+ Here is the call graph for this function:

bool Executor::has_extension_module ( ExtModuleKinds  kind) const
inlineprivate

Definition at line 1513 of file Execute.h.

References extension_modules_.

Referenced by has_geos_module(), has_libdevice_module(), has_rt_module(), has_rt_udf_module(), and has_udf_module().

1513  {
1514  return extension_modules_.find(kind) != extension_modules_.end();
1515  }
std::map< ExtModuleKinds, std::unique_ptr< llvm::Module > > extension_modules_
Definition: Execute.h:1517

+ Here is the caller graph for this function:

bool Executor::has_geos_module ( ) const
inline

Definition at line 563 of file Execute.h.

References has_extension_module(), and rt_geos_module.

563  {
564  return has_extension_module(ExtModuleKinds::rt_geos_module);
565  }
bool has_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1513

+ Here is the call graph for this function:

bool Executor::has_libdevice_module ( ) const
inline

Definition at line 566 of file Execute.h.

References has_extension_module(), and rt_libdevice_module.

566  {
567  return has_extension_module(ExtModuleKinds::rt_libdevice_module);
568  }
bool has_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1513

+ Here is the call graph for this function:

bool Executor::has_rt_module ( ) const
inline

Definition at line 552 of file Execute.h.

References has_extension_module(), and template_module.

552  {
553  return has_extension_module(ExtModuleKinds::template_module);
554  }
bool has_extension_module(ExtModuleKinds kind) const
Definition: Execute.h:1513

+ Here is the call graph for this function:

bool Executor::has_rt_udf_module ( bool  is_gpu = false) const
inline

Definition at line 559 of file Execute.h.

References has_extension_module(), rt_udf_cpu_module, and rt_udf_gpu_module.

+ Here is the call graph for this function:

bool Executor::has_udf_module ( bool  is_gpu = false) const
inline

Definition at line 555 of file Execute.h.

References has_extension_module(), udf_cpu_module, and udf_gpu_module.

+ Here is the call graph for this function:

bool Executor::hasLazyFetchColumns ( const std::vector< Analyzer::Expr * > &  target_exprs) const

Definition at line 981 of file Execute.cpp.

References CHECK, and plan_state_.

982  {
983  CHECK(plan_state_);
984  for (const auto target_expr : target_exprs) {
985  if (plan_state_->isLazyFetchColumn(target_expr)) {
986  return true;
987  }
988  }
989  return false;
990 }
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define CHECK(condition)
Definition: Logger.h:291
void Executor::init_resource_mgr ( const size_t  num_cpu_slots,
const size_t  num_gpu_slots,
const size_t  cpu_result_mem,
const size_t  cpu_buffer_pool_mem,
const size_t  gpu_buffer_pool_mem,
const double  per_query_max_cpu_slots_ratio,
const double  per_query_max_cpu_result_mem_ratio,
const bool  allow_cpu_kernel_concurrency,
const bool  allow_cpu_gpu_kernel_concurrency,
const bool  allow_cpu_slot_oversubscription_concurrency,
const bool  allow_cpu_result_mem_oversubscription,
const double  max_available_resource_use_ratio 
)
static

Definition at line 5353 of file Execute.cpp.

References executor_resource_mgr_, and ExecutorResourceMgr_Namespace::generate_executor_resource_mgr().

Referenced by DBHandler::init_executor_resource_mgr(), and QueryRunner::QueryRunner::QueryRunner().

5365  {
5366  const double per_query_max_pinned_cpu_buffer_pool_mem_ratio{1.0};
5367  const double per_query_max_pageable_cpu_buffer_pool_mem_ratio{0.5};
5368  executor_resource_mgr_ = ExecutorResourceMgr_Namespace::generate_executor_resource_mgr(
5369  num_cpu_slots,
5370  num_gpu_slots,
5371  cpu_result_mem,
5372  cpu_buffer_pool_mem,
5373  gpu_buffer_pool_mem,
5374  per_query_max_cpu_slots_ratio,
5375  per_query_max_cpu_result_mem_ratio,
5376  per_query_max_pinned_cpu_buffer_pool_mem_ratio,
5377  per_query_max_pageable_cpu_buffer_pool_mem_ratio,
5378  allow_cpu_kernel_concurrency,
5379  allow_cpu_gpu_kernel_concurrency,
5380  allow_cpu_slot_oversubscription_concurrency,
5381  true, // allow_gpu_slot_oversubscription
5382  allow_cpu_result_mem_oversubscription_concurrency,
5383  max_available_resource_use_ratio);
5384 }
std::shared_ptr< ExecutorResourceMgr > generate_executor_resource_mgr(const size_t num_cpu_slots, const size_t num_gpu_slots, const size_t cpu_result_mem, const size_t cpu_buffer_pool_mem, const size_t gpu_buffer_pool_mem, const double per_query_max_cpu_slots_ratio, const double per_query_max_cpu_result_mem_ratio, const double per_query_max_pinned_cpu_buffer_pool_mem_ratio, const double per_query_max_pageable_cpu_buffer_pool_mem_ratio, const bool allow_cpu_kernel_concurrency, const bool allow_cpu_gpu_kernel_concurrency, const bool allow_cpu_slot_oversubscription_concurrency, const bool allow_gpu_slot_oversubscription, const bool allow_cpu_result_mem_oversubscription_concurrency, const double max_available_resource_use_ratio)
Convenience factory-esque method that allows us to use the same logic to generate an ExecutorResource...
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
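A hedged sketch of the one-time setup a server startup path (such as the DBHandler caller listed above) might perform; every numeric value below is an illustrative placeholder, not a recommended setting:

#include "Execute.h"

void init_executor_resources_example() {
  constexpr size_t kGiB = size_t(1) << 30;  // placeholder sizing helper
  Executor::init_resource_mgr(/*num_cpu_slots=*/16,
                              /*num_gpu_slots=*/2,
                              /*cpu_result_mem=*/8 * kGiB,
                              /*cpu_buffer_pool_mem=*/32 * kGiB,
                              /*gpu_buffer_pool_mem=*/16 * kGiB,
                              /*per_query_max_cpu_slots_ratio=*/0.9,
                              /*per_query_max_cpu_result_mem_ratio=*/0.8,
                              /*allow_cpu_kernel_concurrency=*/true,
                              /*allow_cpu_gpu_kernel_concurrency=*/true,
                              /*allow_cpu_slot_oversubscription_concurrency=*/false,
                              /*allow_cpu_result_mem_oversubscription=*/false,
                              /*max_available_resource_use_ratio=*/0.8);
}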

void Executor::initialize_extension_module_sources ( )
static

Definition at line 294 of file Execute.cpp.

References CHECK, extension_module_sources, get_cuda_libdevice_dir(), heavyai::get_root_abs_path(), LOG, rt_geos_module, rt_libdevice_module, template_module, and logger::WARNING.

Referenced by input_table_info_cache_().

294  {
298  auto root_path = heavyai::get_root_abs_path();
299  auto template_path = root_path + "/QueryEngine/RuntimeFunctions.bc";
300  CHECK(boost::filesystem::exists(template_path));
301  extension_module_sources[ExtModuleKinds::template_module] =
302  template_path;
303 #ifdef ENABLE_GEOS
304  auto rt_geos_path = root_path + "/QueryEngine/GeosRuntime.bc";
305  CHECK(boost::filesystem::exists(rt_geos_path));
306  extension_module_sources[ExtModuleKinds::rt_geos_module] =
307  rt_geos_path;
308 #endif
309 #ifdef HAVE_CUDA
310  auto rt_libdevice_path = get_cuda_libdevice_dir() + "/libdevice.10.bc";
311  if (boost::filesystem::exists(rt_libdevice_path)) {
312  extension_module_sources[ExtModuleKinds::rt_libdevice_module] =
313  rt_libdevice_path;
314  } else {
315  LOG(WARNING) << "File " << rt_libdevice_path
316  << " does not exist; support for some UDF "
317  "functions might not be available.";
318  }
319 #endif
320  }
321 }
std::string get_cuda_libdevice_dir(void)
Definition: CudaMgr.cpp:612
std::string get_root_abs_path()
#define LOG(tag)
Definition: Logger.h:285
static std::map< ExtModuleKinds, std::string > extension_module_sources
Definition: Execute.h:528
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
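initialize_extension_module_sources() only records file paths; the bitcode is parsed into llvm::Module objects later. A hedged sketch of loading one of the recorded .bc files with the stock LLVM IR reader; the helper name and error handling are assumptions, not the engine's loader:

#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Support/SourceMgr.h>
#include <memory>
#include <stdexcept>
#include <string>

// Hypothetical helper: parse a bitcode/IR file whose path was recorded in
// extension_module_sources (e.g. <root>/QueryEngine/RuntimeFunctions.bc).
std::unique_ptr<llvm::Module> load_extension_module(const std::string& path,
                                                    llvm::LLVMContext& ctx) {
  llvm::SMDiagnostic err;
  auto module = llvm::parseIRFile(path, err, ctx);  // accepts both .ll and .bc inputs
  if (!module) {
    throw std::runtime_error("Failed to parse extension module at " + path + ": " +
                             err.getMessage().str());
  }
  return module;
}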

void Executor::initializeNVPTXBackend ( ) const
private

Definition at line 1549 of file NativeCodegen.cpp.

1549  {
1550  if (nvptx_target_machine_) {
1551  return;
1552  }
1553  const auto arch = cudaMgr()->getDeviceArch();
1554  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1555 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:865
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
Definition: Execute.h:1547
NvidiaDeviceArch getDeviceArch() const
Definition: CudaMgr.h:186
static std::unique_ptr< llvm::TargetMachine > initializeNVPTXBackend(const CudaMgr_Namespace::NvidiaDeviceArch arch)
std::vector< llvm::Value * > Executor::inlineHoistedLiterals ( )
private

Definition at line 2374 of file NativeCodegen.cpp.

2374  {
2375  AUTOMATIC_IR_METADATA(cgen_state_.get());
2376 
2377  std::vector<llvm::Value*> hoisted_literals;
2378 
2379  // row_func_ is using literals whose defs have been hoisted up to the query_func_,
2380  // extend row_func_ signature to include extra args to pass these literal values.
2381  std::vector<llvm::Type*> row_process_arg_types;
2382 
2383  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2384  E = cgen_state_->row_func_->arg_end();
2385  I != E;
2386  ++I) {
2387  row_process_arg_types.push_back(I->getType());
2388  }
2389 
2390  for (auto& element : cgen_state_->query_func_literal_loads_) {
2391  for (auto value : element.second) {
2392  row_process_arg_types.push_back(value->getType());
2393  }
2394  }
2395 
2396  auto ft = llvm::FunctionType::get(
2397  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
2398  auto row_func_with_hoisted_literals =
2399  llvm::Function::Create(ft,
2400  llvm::Function::ExternalLinkage,
2401  "row_func_hoisted_literals",
2402  cgen_state_->row_func_->getParent());
2403 
2404  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
2405  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2406  E = cgen_state_->row_func_->arg_end();
2407  I != E;
2408  ++I) {
2409  if (I->hasName()) {
2410  row_func_arg_it->setName(I->getName());
2411  }
2412  ++row_func_arg_it;
2413  }
2414 
2415  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
2416  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
2417  if (cgen_state_->filter_func_) {
2418  // filter_func_ is using literals whose defs have been hoisted up to the row_func_,
2419  // extend filter_func_ signature to include extra args to pass these literal values.
2420  std::vector<llvm::Type*> filter_func_arg_types;
2421 
2422  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2423  E = cgen_state_->filter_func_->arg_end();
2424  I != E;
2425  ++I) {
2426  filter_func_arg_types.push_back(I->getType());
2427  }
2428 
2429  for (auto& element : cgen_state_->query_func_literal_loads_) {
2430  for (auto value : element.second) {
2431  filter_func_arg_types.push_back(value->getType());
2432  }
2433  }
2434 
2435  auto ft2 = llvm::FunctionType::get(
2436  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
2437  filter_func_with_hoisted_literals =
2438  llvm::Function::Create(ft2,
2439  llvm::Function::ExternalLinkage,
2440  "filter_func_hoisted_literals",
2441  cgen_state_->filter_func_->getParent());
2442 
2443  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
2444  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2445  E = cgen_state_->filter_func_->arg_end();
2446  I != E;
2447  ++I) {
2448  if (I->hasName()) {
2449  filter_func_arg_it->setName(I->getName());
2450  }
2451  ++filter_func_arg_it;
2452  }
2453  }
2454 
2455  std::unordered_map<int, std::vector<llvm::Value*>>
2456  query_func_literal_loads_function_arguments,
2457  query_func_literal_loads_function_arguments2;
2458 
2459  for (auto& element : cgen_state_->query_func_literal_loads_) {
2460  std::vector<llvm::Value*> argument_values, argument_values2;
2461 
2462  for (auto value : element.second) {
2463  hoisted_literals.push_back(value);
2464  argument_values.push_back(&*row_func_arg_it);
2465  if (cgen_state_->filter_func_) {
2466  argument_values2.push_back(&*filter_func_arg_it);
2467  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
2468  }
2469  if (value->hasName()) {
2470  row_func_arg_it->setName("arg_" + value->getName());
2471  if (cgen_state_->filter_func_) {
2472  filter_func_arg_it->getContext();
2473  filter_func_arg_it->setName("arg_" + value->getName());
2474  }
2475  }
2476  ++row_func_arg_it;
2477  ++filter_func_arg_it;
2478  }
2479 
2480  query_func_literal_loads_function_arguments[element.first] = argument_values;
2481  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
2482  }
2483 
2484  // copy the row_func function body over
2485  // see
2486  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2487  row_func_with_hoisted_literals->getBasicBlockList().splice(
2488  row_func_with_hoisted_literals->begin(),
2489  cgen_state_->row_func_->getBasicBlockList());
2490 
2491  // also replace row_func arguments with the arguments from row_func_hoisted_literals
2492  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2493  E = cgen_state_->row_func_->arg_end(),
2494  I2 = row_func_with_hoisted_literals->arg_begin();
2495  I != E;
2496  ++I) {
2497  I->replaceAllUsesWith(&*I2);
2498  I2->takeName(&*I);
2499  cgen_state_->filter_func_args_.replace(&*I, &*I2);
2500  ++I2;
2501  }
2502 
2503  cgen_state_->row_func_ = row_func_with_hoisted_literals;
2504 
2505  // and finally replace literal placeholders
2506  std::vector<llvm::Instruction*> placeholders;
2507  std::string prefix("__placeholder__literal_");
2508  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2509  e = llvm::inst_end(row_func_with_hoisted_literals);
2510  it != e;
2511  ++it) {
2512  if (it->hasName() && it->getName().startswith(prefix)) {
2513  auto offset_and_index_entry =
2514  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2515  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2516 
2517  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2518  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2519 
2520  it->replaceAllUsesWith(
2521  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2522  placeholders.push_back(&*it);
2523  }
2524  }
2525  for (auto placeholder : placeholders) {
2526  placeholder->removeFromParent();
2527  }
2528 
2529  if (cgen_state_->filter_func_) {
2530  // copy the filter_func function body over
2531  // see
2532  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2533  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2534  filter_func_with_hoisted_literals->begin(),
2535  cgen_state_->filter_func_->getBasicBlockList());
2536 
2537  // also replace filter_func arguments with the arguments from
2538  // filter_func_hoisted_literals
2539  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2540  E = cgen_state_->filter_func_->arg_end(),
2541  I2 = filter_func_with_hoisted_literals->arg_begin();
2542  I != E;
2543  ++I) {
2544  I->replaceAllUsesWith(&*I2);
2545  I2->takeName(&*I);
2546  ++I2;
2547  }
2548 
2549  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2550 
2551  // and finally replace literal placeholders
2552  std::vector<llvm::Instruction*> placeholders;
2553  std::string prefix("__placeholder__literal_");
2554  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2555  e = llvm::inst_end(filter_func_with_hoisted_literals);
2556  it != e;
2557  ++it) {
2558  if (it->hasName() && it->getName().startswith(prefix)) {
2559  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2560  llvm::dyn_cast<llvm::Value>(&*it));
2561  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2562 
2563  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2564  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2565 
2566  it->replaceAllUsesWith(
2567  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2568  placeholders.push_back(&*it);
2569  }
2570  }
2571  for (auto placeholder : placeholders) {
2572  placeholder->removeFromParent();
2573  }
2574  }
2575 
2576  return hoisted_literals;
2577 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK(condition)
Definition: Logger.h:291
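The transformation is easier to see away from LLVM IR: literal values that used to be decoded inside each per-row call become extra trailing arguments decoded once by the caller. A compact hedged C++ analogy of that calling-convention change (not the engine's generated code):

#include <cstdint>
#include <iostream>
#include <vector>

// Analogy only: the "hoisted" variant takes the literal as a plain trailing argument,
// mirroring how row_func_hoisted_literals extends the original row_func signature.
static int32_t row_func_hoisted(const int64_t col_val, const int64_t hoisted_literal) {
  return col_val > hoisted_literal ? 1 : 0;
}

int main() {
  const std::vector<int64_t> column{10, 42, 7};
  const int64_t hoisted_literal = 9;  // decoded once by the caller, mirroring query_func_
  int32_t matches = 0;
  for (const auto v : column) {
    matches += row_func_hoisted(v, hoisted_literal);
  }
  std::cout << matches << " rows matched\n";  // prints "2 rows matched"
  return 0;
}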
void Executor::insertErrorCodeChecker ( llvm::Function *  query_func,
unsigned const  error_code_idx,
bool  hoist_literals,
bool  allow_runtime_query_interrupt 
)
private

Definition at line 3243 of file NativeCodegen.cpp.

3246  {
3247  auto query_stub_func_name =
3248  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
3249  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
3250  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
3251  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
3252  continue;
3253  }
3254  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
3255  auto const row_func_name = CodegenUtil::getCalledFunctionName(row_func_call);
3256  if (row_func_name && *row_func_name == query_stub_func_name) {
3257  auto next_inst_it = inst_it;
3258  ++next_inst_it;
3259  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
3260  auto& br_instr = bb_it->back();
3261  llvm::IRBuilder<> ir_builder(&br_instr);
3262  llvm::Value* err_lv = &*inst_it;
3263  auto error_check_bb =
3264  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
3265  // query_func does not have parameter names assigned.
3266  llvm::Value* const error_code_arg = get_arg_by_index(query_func, error_code_idx);
3267  CHECK(error_code_arg) << error_code_idx << '/' << query_func->arg_size();
3268  llvm::Value* err_code = nullptr;
3269  if (allow_runtime_query_interrupt) {
3270  // decide the final error code with a consideration of interrupt status
3271  auto& check_interrupt_br_instr = bb_it->back();
3272  auto interrupt_check_bb = llvm::BasicBlock::Create(
3273  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
3274  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
3275  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
3276  cgen_state_->module_->getFunction("check_interrupt"), {});
3277  auto detected_error = interrupt_checker_ir_builder.CreateCall(
3278  cgen_state_->module_->getFunction("get_error_code"),
3279  std::vector<llvm::Value*>{error_code_arg});
3280  err_code = interrupt_checker_ir_builder.CreateSelect(
3281  detected_interrupt,
3282  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
3283  detected_error);
3284  interrupt_checker_ir_builder.CreateBr(error_check_bb);
3285  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
3286  llvm::BranchInst::Create(interrupt_check_bb));
3287  ir_builder.SetInsertPoint(&br_instr);
3288  } else {
3289  // uses error code returned from row_func and skip to check interrupt status
3290  ir_builder.SetInsertPoint(&br_instr);
3291  err_code =
3292  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
3293  std::vector<llvm::Value*>{error_code_arg});
3294  }
3295  err_lv = ir_builder.CreateICmp(
3296  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
3297  auto error_bb = llvm::BasicBlock::Create(
3298  cgen_state_->context_, ".error_exit", query_func, new_bb);
3299  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
3300  std::vector<llvm::Value*>{err_code, error_code_arg},
3301  "",
3302  error_bb);
3303  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
3304  llvm::ReplaceInstWithInst(&br_instr,
3305  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
3306  break;
3307  }
3308  }
3309  }
3310 }
std::optional< std::string_view > getCalledFunctionName(llvm::CallInst &call_inst)
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:1623
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
#define CHECK(condition)
Definition: Logger.h:291
llvm::Value * get_arg_by_index(llvm::Function *func, unsigned const index)
Definition: Execute.h:178
void Executor::interrupt ( const QuerySessionId query_session = "",
const QuerySessionId interrupt_session = "" 
)

Definition at line 42 of file GpuInterrupt.cpp.

References CHECK, CHECK_EQ, CHECK_GE, check_interrupt_init(), checkCudaErrors(), data_mgr_(), DW_ABORT, dw_abort, dynamic_watchdog_init(), g_enable_dynamic_watchdog, g_enable_non_kernel_time_query_interrupt, g_enable_runtime_query_interrupt, INT_ABORT, runtime_interrupt_flag, to_string(), and VLOG.

43  {
44  const auto allow_interrupt =
45  g_enable_runtime_query_interrupt || g_enable_non_kernel_time_query_interrupt;
46  if (allow_interrupt) {
47  bool is_running_query = false;
48  {
49  // here we validate the requested query session is valid (is already enrolled)
50  // if not, we skip the interrupt request
51  heavyai::shared_lock<heavyai::shared_mutex> session_read_lock(
52  executor_session_mutex_);
53  if (!checkIsQuerySessionEnrolled(query_session, session_read_lock)) {
54  VLOG(1) << "Skip the interrupt request (no query has been submitted from the "
55  "given query session)";
56  return;
57  }
58  if (checkIsQuerySessionInterrupted(query_session, session_read_lock)) {
59  VLOG(1) << "Skip the interrupt request (already interrupted query session)";
60  return;
61  }
62  // if the query is still pending, we just need to turn the interrupt flag for the session
63  // on (no interrupt signal is sent to a "RUNNING" kernel; see the code below)
64  is_running_query = checkCurrentQuerySession(query_session, session_read_lock);
65  }
66  {
67  // We have to cover interrupt request from *any* session because we don't know
68  // whether the request is for the running query or pending query
69  // or for non-kernel time interrupt
70  // (or just false alarm that indicates unregistered session in a queue).
71  // So we try to set a session has been interrupted once we confirm
72  // the session has been enrolled and is not interrupted at this moment
73  heavyai::unique_lock<heavyai::shared_mutex> session_write_lock(
74  executor_session_mutex_);
75  setQuerySessionAsInterrupted(query_session, session_write_lock);
76  }
77  if (!is_running_query) {
78  return;
79  }
80  // mark the interrupted status of this executor
81  interrupted_.store(true);
82  }
83 
84  // for both GPU and CPU kernel execution, interrupt flag that running kernel accesses
85  // is a global variable from a view of Executors
86  // but it's okay for now since we hold a kernel_lock when starting the query execution
87  // this indicates we should revisit this logic when starting to use multi-query
88  // execution for supporting per-kernel interrupt
89  bool CPU_execution_mode = true;
90 
91 #ifdef HAVE_CUDA
92  // The below code is basically for runtime query interrupt for GPU.
93  // It is also possible that the user forces CPU mode even when GPU(s) are available.
94  // In that case, we should not execute the code below, to avoid a runtime failure
96  auto cuda_mgr = data_mgr_->getCudaMgr();
97  if (cuda_mgr && (g_enable_dynamic_watchdog || allow_interrupt)) {
98  // we additionally allow sending interrupt signal for
99  // `g_enable_non_kernel_time_query_interrupt` especially for CTAS/ITAS queries: data
100  // population happens on CPU but select_query can be processed via GPU
101  CHECK_GE(cuda_mgr->getDeviceCount(), 1);
102  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
103  CUcontext old_cu_context;
104  checkCudaErrors(cuCtxGetCurrent(&old_cu_context));
105  for (int device_id = 0; device_id < max_gpu_count; device_id++) {
106  if (gpu_active_modules_device_mask_ & (1 << device_id)) {
107  void* llvm_module = gpu_active_modules_[device_id];
108  auto cu_module = static_cast<CUmodule>(llvm_module);
109  if (!cu_module) {
110  continue;
111  } else {
112  VLOG(1) << "Try to interrupt the running query on GPU assigned to Executor "
113  << executor_id_;
114  CPU_execution_mode = false;
115  }
116  cuda_mgr->setContext(device_id);
117 
118  // Create high priority non-blocking communication stream
119  CUstream cu_stream1;
120  checkCudaErrors(
121  cuStreamCreateWithPriority(&cu_stream1, CU_STREAM_NON_BLOCKING, 1));
122 
123  CUevent start, stop;
124  cuEventCreate(&start, 0);
125  cuEventCreate(&stop, 0);
126  cuEventRecord(start, cu_stream1);
127 
128  if (g_enable_dynamic_watchdog) {
129  CUdeviceptr dw_abort;
130  size_t dw_abort_size;
131  if (cuModuleGetGlobal(&dw_abort, &dw_abort_size, cu_module, "dw_abort") ==
132  CUDA_SUCCESS) {
133  CHECK_EQ(dw_abort_size, sizeof(uint32_t));
134  int32_t abort_val = 1;
135  checkCudaErrors(cuMemcpyHtoDAsync(dw_abort,
136  reinterpret_cast<void*>(&abort_val),
137  sizeof(int32_t),
138  cu_stream1));
139 
140  if (device_id == 0) {
141  VLOG(1) << "GPU: Async Abort submitted to Device "
142  << std::to_string(device_id);
143  }
144  }
145  }
146 
147  if (allow_interrupt) {
148  CUdeviceptr runtime_interrupt_flag;
149  size_t runtime_interrupt_flag_size;
150  auto status = cuModuleGetGlobal(&runtime_interrupt_flag,
151  &runtime_interrupt_flag_size,
152  cu_module,
153  "runtime_interrupt_flag");
154  if (status == CUDA_SUCCESS) {
155  VLOG(1) << "Executor " << executor_id_
156  << " retrieves interrupt status from GPU " << device_id;
157  CHECK_EQ(runtime_interrupt_flag_size, sizeof(uint32_t));
158  int32_t abort_val = 1;
159  checkCudaErrors(cuMemcpyHtoDAsync(runtime_interrupt_flag,
160  reinterpret_cast<void*>(&abort_val),
161  sizeof(int32_t),
162  cu_stream1));
163  if (device_id == 0) {
164  VLOG(1) << "GPU: send interrupt signal from Executor " << executor_id_
165  << " to Device " << std::to_string(device_id);
166  }
167  } else if (status == CUDA_ERROR_NOT_FOUND) {
168  std::runtime_error(
169  "Runtime query interrupt on Executor " + std::to_string(executor_id_) +
170  " has failed: an interrupt flag on the GPU could "
171  "not be initialized (CUDA_ERROR_CODE: CUDA_ERROR_NOT_FOUND)");
172  } else {
173  // if we reach here, query runtime interrupt is failed due to
174  // one of the following error: CUDA_ERROR_NOT_INITIALIZED,
175  // CUDA_ERROR_DEINITIALIZED. CUDA_ERROR_INVALID_CONTEXT, and
176  // CUDA_ERROR_INVALID_VALUE. All those error codes are due to device failure.
177  const char* error_ret_str = nullptr;
178  cuGetErrorName(status, &error_ret_str);
179  if (!error_ret_str) {
180  error_ret_str = "UNKNOWN";
181  }
182  std::string error_str(error_ret_str);
183  std::runtime_error(
184  "Runtime interrupt on Executor " + std::to_string(executor_id_) +
185  " has failed due to a device " + std::to_string(device_id) +
186  "'s issue "
187  "(CUDA_ERROR_CODE: " +
188  error_str + ")");
189  }
190 
191  cuEventRecord(stop, cu_stream1);
192  cuEventSynchronize(stop);
193  float milliseconds = 0;
194  cuEventElapsedTime(&milliseconds, start, stop);
195  VLOG(1) << "Device " << std::to_string(device_id)
196  << ": submitted async interrupt request from Executor " << executor_id_
197  << " : SUCCESS: " << std::to_string(milliseconds) << " ms";
198  checkCudaErrors(cuStreamDestroy(cu_stream1));
199  }
200  }
201  checkCudaErrors(cuCtxSetCurrent(old_cu_context));
202  }
203  }
204 #endif
205  if (g_enable_dynamic_watchdog) {
206  dynamic_watchdog_init(static_cast<unsigned>(DW_ABORT));
207  }
208 
209  if (allow_interrupt && CPU_execution_mode) {
210  // turn interrupt flag on for CPU mode
211  VLOG(1) << "Try to interrupt the running query on CPU from Executor " << executor_id_;
212  check_interrupt_init(static_cast<unsigned>(INT_ABORT));
213  }
214 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:235
#define CHECK_EQ(x, y)
Definition: Logger.h:301
bool checkIsQuerySessionInterrupted(const std::string &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:5220
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
std::atomic< bool > interrupted_
Definition: Execute.h:1543
int CUcontext
Definition: nocuda.h:22
static const int max_gpu_count
Definition: Execute.h:1535
void * CUstream
Definition: nocuda.h:23
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
unsigned long long CUdeviceptr
Definition: nocuda.h:28
#define CHECK_GE(x, y)
Definition: Logger.h:306
static void * gpu_active_modules_[max_gpu_count]
Definition: Execute.h:1541
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:81
static uint32_t gpu_active_modules_device_mask_
Definition: Execute.h:1540
bool g_enable_non_kernel_time_query_interrupt
Definition: Execute.cpp:134
bool checkCurrentQuerySession(const std::string &candidate_query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:4957
std::string to_string(char const *&&v)
std::shared_lock< T > shared_lock
const ExecutorId executor_id_
Definition: Execute.h:1476
std::unique_lock< T > unique_lock
bool checkIsQuerySessionEnrolled(const QuerySessionId &query_session, heavyai::shared_lock< heavyai::shared_mutex > &read_lock)
Definition: Execute.cpp:5231
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:95
RUNTIME_EXPORT uint64_t dynamic_watchdog_init(unsigned ms_budget)
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574
#define CHECK(condition)
Definition: Logger.h:291
RUNTIME_EXPORT bool check_interrupt_init(unsigned command)
void setQuerySessionAsInterrupted(const QuerySessionId &query_session, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
Definition: Execute.cpp:5209
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:94
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:133
#define VLOG(n)
Definition: Logger.h:388
void * CUmodule
Definition: nocuda.h:24
static std::mutex gpu_active_modules_mutex_
Definition: Execute.h:1539

+ Here is the call graph for this function:
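The GPU portion of the interrupt amounts to asynchronously writing 1 into an int32 global owned by the loaded CUmodule (dw_abort or runtime_interrupt_flag). A standalone hedged sketch of that write with the CUDA driver API; the helper function and its error handling are assumptions, and only the cuModuleGetGlobal/cuMemcpyHtoDAsync pattern mirrors the listing above:

#include <cuda.h>
#include <cstdint>
#include <stdexcept>
#include <string>

// Hypothetical helper: raise a named int32 device flag (e.g. "runtime_interrupt_flag")
// inside an already-loaded CUmodule without blocking the host thread.
void raise_device_flag(CUmodule module, const char* flag_name, CUstream stream) {
  CUdeviceptr flag_ptr{};
  size_t flag_size{};
  if (cuModuleGetGlobal(&flag_ptr, &flag_size, module, flag_name) != CUDA_SUCCESS ||
      flag_size != sizeof(int32_t)) {
    throw std::runtime_error(std::string("Device flag not found: ") + flag_name);
  }
  const int32_t raised = 1;
  // Pageable host memory is staged by the driver, so passing a local here is safe,
  // matching the abort_val pattern in the listing above.
  if (cuMemcpyHtoDAsync(flag_ptr, &raised, sizeof(int32_t), stream) != CUDA_SUCCESS) {
    throw std::runtime_error("Failed to submit async flag write");
  }
}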

void Executor::invalidateCardinalityCacheForTable ( const shared::TableKey table_key)
static

Definition at line 5282 of file Execute.cpp.

References cardinality_cache_, g_use_estimator_result_cache, and recycler_mutex_.

Referenced by clearExternalCaches().

5282  {
5283  heavyai::unique_lock<heavyai::shared_mutex> lock(recycler_mutex_);
5284  if (g_use_estimator_result_cache) {
5285  for (auto it = cardinality_cache_.begin(); it != cardinality_cache_.end();) {
5286  if (it->first.containsTableKey(table_key)) {
5287  it = cardinality_cache_.erase(it);
5288  } else {
5289  it++;
5290  }
5291  }
5292  }
5293 }
std::unique_lock< T > unique_lock
static std::unordered_map< CardinalityCacheKey, size_t > cardinality_cache_
Definition: Execute.h:1607
static heavyai::shared_mutex recycler_mutex_
Definition: Execute.h:1605
bool g_use_estimator_result_cache
Definition: Execute.cpp:135

+ Here is the caller graph for this function:
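The loop above relies on the standard erase-while-iterating idiom for associative containers: erase() returns the iterator to the next element, so the iterator is advanced manually only when nothing was erased. A self-contained sketch of the same idiom with generic types (not the cardinality cache's actual key type):

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, size_t> cache{{"t1.a", 10}, {"t2.b", 20}, {"t1.c", 30}};
  const std::string dropped_table = "t1";
  for (auto it = cache.begin(); it != cache.end();) {
    if (it->first.rfind(dropped_table + ".", 0) == 0) {  // entry refers to the dropped table
      it = cache.erase(it);  // erase() hands back the next valid iterator
    } else {
      ++it;
    }
  }
  std::cout << cache.size() << " entry remains\n";  // prints "1 entry remains"
  return 0;
}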

void Executor::invalidateRunningQuerySession ( heavyai::unique_lock< heavyai::shared_mutex > &  write_lock)

Definition at line 4979 of file Execute.cpp.

References current_query_session_.

Referenced by clearQuerySessionStatus().

4980  {
4981  current_query_session_ = "";
4982 }
QuerySessionId current_query_session_
Definition: Execute.h:1576

+ Here is the caller graph for this function:

bool Executor::isArchMaxwell ( const ExecutorDeviceType  dt) const

Definition at line 25 of file MaxwellCodegenPatch.cpp.

References GPU.

25  {
26  return dt == ExecutorDeviceType::GPU && cudaMgr()->isArchMaxwell();
27 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:865
bool isArchMaxwell() const
Definition: CudaMgr.h:147
bool Executor::isArchPascalOrLater ( const ExecutorDeviceType  dt) const
inlineprivate

Definition at line 872 of file Execute.h.

References cudaMgr(), GPU, and CudaMgr_Namespace::CudaMgr::isArchPascalOrLater().

Referenced by getDeviceTypeForTargets().

872  {
873  if (dt == ExecutorDeviceType::GPU) {
874  return cudaMgr()->isArchPascalOrLater();
875  }
876  return false;
877  }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:865
bool isArchPascalOrLater() const
Definition: CudaMgr.h:156

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool Executor::isCPUOnly ( ) const

Definition at line 681 of file Execute.cpp.

References CHECK, data_mgr_, and Data_Namespace::DataMgr::getCudaMgr().

681  {
682  CHECK(data_mgr_);
683  return !data_mgr_->getCudaMgr();
684 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:235
Data_Namespace::DataMgr * data_mgr_
Definition: Execute.h:1558
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

bool Executor::isFragmentFullyDeleted ( const InputDescriptor table_desc,
const Fragmenter_Namespace::FragmentInfo fragment 
)
private

Definition at line 4527 of file Execute.cpp.

References CHECK, extract_max_stat_int_type(), extract_min_stat_int_type(), Catalog_Namespace::SysCatalog::getCatalog(), Fragmenter_Namespace::FragmentInfo::getChunkMetadataMap(), InputDescriptor::getTableKey(), Catalog_Namespace::SysCatalog::instance(), and Fragmenter_Namespace::FragmentInfo::physicalTableId.

Referenced by skipFragment().

4529  {
4530  // Skip temporary tables
4531  const auto& table_key = table_desc.getTableKey();
4532  if (table_key.table_id < 0) {
4533  return false;
4534  }
4535 
4536  const auto catalog =
4538  CHECK(catalog);
4539  const auto td = catalog->getMetadataForTable(fragment.physicalTableId);
4540  CHECK(td);
4541  const auto deleted_cd = catalog->getDeletedColumnIfRowsDeleted(td);
4542  if (!deleted_cd) {
4543  return false;
4544  }
4545 
4546  const auto& chunk_type = deleted_cd->columnType;
4547  CHECK(chunk_type.is_boolean());
4548 
4549  const auto deleted_col_id = deleted_cd->columnId;
4550  auto chunk_meta_it = fragment.getChunkMetadataMap().find(deleted_col_id);
4551  if (chunk_meta_it != fragment.getChunkMetadataMap().end()) {
4552  const int64_t chunk_min =
4553  extract_min_stat_int_type(chunk_meta_it->second->chunkStats, chunk_type);
4554  const int64_t chunk_max =
4555  extract_max_stat_int_type(chunk_meta_it->second->chunkStats, chunk_type);
4556  if (chunk_min == 1 && chunk_max == 1) { // Delete chunk if metadata says full bytemap
4557  // is true (signifying all rows deleted)
4558  return true;
4559  }
4560  }
4561  return false;
4562 }
int64_t extract_max_stat_int_type(const ChunkStats &stats, const SQLTypeInfo &ti)
static SysCatalog & instance()
Definition: SysCatalog.h:343
int64_t extract_min_stat_int_type(const ChunkStats &stats, const SQLTypeInfo &ti)
const ChunkMetadataMap & getChunkMetadataMap() const
const shared::TableKey & getTableKey() const
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Executor::launchKernelsImpl ( SharedKernelContext shared_context,
std::vector< std::unique_ptr< ExecutionKernel >> &&  kernels,
const ExecutorDeviceType  device_type,
const size_t  requested_num_threads 
)
private

Launches execution kernels created by createKernels asynchronously using a thread pool.

Definition at line 3008 of file Execute.cpp.

References SharedKernelContext::addDeviceResults(), auto_num_threads, CHECK, CPU, cpu_threads(), DEBUG_TIMER_NEW_THREAD, RelAlgExecutionUnit::estimator, logger::EXECUTOR, g_enable_cpu_sub_tasks, LOG, threading_std::task_group::run(), SharedKernelContext::setNumAllocatedThreads(), logger::thread_local_ids(), VLOG, and threading_std::task_group::wait().

Referenced by launchKernelsLocked(), and launchKernelsViaResourceMgr().

3011  {
3012 #ifdef HAVE_TBB
3013  const size_t num_threads =
3014  requested_num_threads == Executor::auto_num_threads
3015  ? std::min(kernels.size(), static_cast<size_t>(cpu_threads()))
3016  : requested_num_threads;
3017  tbb::task_arena local_arena(num_threads);
3018 #else
3019  const size_t num_threads = cpu_threads();
3020 #endif
3021  shared_context.setNumAllocatedThreads(num_threads);
3022  LOG(EXECUTOR) << "Launching query step with " << num_threads << " threads.";
3023  threading::task_group tg;
3024  // A hack to have unused unit for results collection.
3025  const RelAlgExecutionUnit* ra_exe_unit =
3026  kernels.empty() ? nullptr : &kernels[0]->ra_exe_unit_;
3027 
3028 #ifdef HAVE_TBB
3029  if (g_enable_cpu_sub_tasks && device_type == ExecutorDeviceType::CPU) {
3030  shared_context.setThreadPool(&tg);
3031  }
3032  ScopeGuard pool_guard([&shared_context]() { shared_context.setThreadPool(nullptr); });
3033 #endif // HAVE_TBB
3034 
3035  VLOG(1) << "Launching " << kernels.size() << " kernels for query on "
3036  << (device_type == ExecutorDeviceType::CPU ? "CPU"s : "GPU"s)
3037  << " using pool of " << num_threads << " threads.";
3038  size_t kernel_idx = 1;
3039 
3040  for (auto& kernel : kernels) {
3041  CHECK(kernel.get());
3042 #ifdef HAVE_TBB
3043  local_arena.execute([&] {
3044 #endif
3045  tg.run([this,
3046  &kernel,
3047  &shared_context,
3048  parent_thread_local_ids = logger::thread_local_ids(),
3049  num_threads,
3050  crt_kernel_idx = kernel_idx++] {
3051  logger::LocalIdsScopeGuard lisg = parent_thread_local_ids.setNewThreadId();
3052  DEBUG_TIMER_NEW_THREAD(parent_thread_local_ids.thread_id_);
3053  // Keep monotonicity of thread_idx by kernel launch time, so that optimizations
3054  // such as launching kernels with data already in pool first become possible
3055 #ifdef HAVE_TBB
3056  const size_t old_thread_idx = crt_kernel_idx % num_threads;
3057  const size_t thread_idx = tbb::this_task_arena::current_thread_index();
3058  LOG(EXECUTOR) << "Thread idx: " << thread_idx
3059  << " Old thread idx: " << old_thread_idx;
3060 #else
3061  const size_t thread_idx = crt_kernel_idx % num_threads;
3062 #endif
3063  kernel->run(this, thread_idx, shared_context);
3064  });
3065 #ifdef HAVE_TBB
3066  }); // local_arena.execute[&]
3067 #endif
3068  }
3069 #ifdef HAVE_TBB
3070  local_arena.execute([&] { tg.wait(); });
3071 #else
3072  tg.wait();
3073 #endif
3074 
3075  for (auto& exec_ctx : shared_context.getTlsExecutionContext()) {
3076  // The first arg is used for GPU only, it's not our case.
3077  // TODO: add QueryExecutionContext::getRowSet() interface
3078  // for our case.
3079  if (exec_ctx) {
3080  ResultSetPtr results;
3081  if (ra_exe_unit->estimator) {
3082  results = std::shared_ptr<ResultSet>(exec_ctx->estimator_result_set_.release());
3083  } else {
3084  results = exec_ctx->getRowSet(*ra_exe_unit, exec_ctx->query_mem_desc_);
3085  }
3086  shared_context.addDeviceResults(std::move(results), {});
3087  }
3088  }
3089 }
#define LOG(tag)
Definition: Logger.h:285
void addDeviceResults(ResultSetPtr &&device_results, std::vector< size_t > outer_table_fragment_ids)
#define DEBUG_TIMER_NEW_THREAD(parent_thread_id)
Definition: Logger.h:417
std::shared_ptr< ResultSet > ResultSetPtr
const std::shared_ptr< Analyzer::Estimator > estimator
static const size_t auto_num_threads
Definition: Execute.h:1536
#define CHECK(condition)
Definition: Logger.h:291
bool g_enable_cpu_sub_tasks
Definition: Execute.cpp:85
void setNumAllocatedThreads(size_t num_threads)
int cpu_threads()
Definition: thread_count.h:25
ThreadLocalIds thread_local_ids()
Definition: Logger.cpp:880
#define VLOG(n)
Definition: Logger.h:388

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
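The scheduling shape above is: size a pool from the request (or cpu_threads()), give each kernel a thread_idx that increases monotonically with launch order modulo the pool size, run everything through the pool, and wait. A simplified hedged sketch of that shape using only the standard library; std::thread stands in for the TBB arena/task_group, and KernelFn is a placeholder for ExecutionKernel::run:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

using KernelFn = std::function<void(size_t thread_idx)>;  // placeholder for ExecutionKernel::run

// Hypothetical launcher: one std::thread per kernel (the real code bounds concurrency with
// a TBB arena), but thread_idx is assigned the same way: launch order modulo pool size.
void launch_kernels(const std::vector<KernelFn>& kernels, const size_t requested_threads) {
  const size_t num_threads =
      std::max<size_t>(std::min(kernels.size(), requested_threads), 1);
  std::vector<std::thread> workers;
  workers.reserve(kernels.size());
  for (size_t kernel_idx = 0; kernel_idx < kernels.size(); ++kernel_idx) {
    workers.emplace_back(kernels[kernel_idx], kernel_idx % num_threads);
  }
  for (auto& worker : workers) {
    worker.join();  // analogous to task_group::wait()
  }
}

int main() {
  const std::vector<KernelFn> kernels(4, [](size_t thread_idx) {
    std::cout << "kernel running with thread_idx " << thread_idx << "\n";
  });
  launch_kernels(kernels, /*requested_threads=*/2);
  return 0;
}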

void Executor::launchKernelsLocked ( SharedKernelContext shared_context,
std::vector< std::unique_ptr< ExecutionKernel >> &&  kernels,
const ExecutorDeviceType  device_type 
)
private

Definition at line 3091 of file Execute.cpp.

References auto_num_threads, kernel_mutex_, kernel_queue_time_ms_, launchKernelsImpl(), timer_start(), and timer_stop().

Referenced by executeWorkUnitImpl().

3094  {
3095  auto clock_begin = timer_start();
3096  std::lock_guard<std::mutex> kernel_lock(kernel_mutex_);
3097  kernel_queue_time_ms_ += timer_stop(clock_begin);
3098 
3099  launchKernelsImpl(
3100  shared_context, std::move(kernels), device_type, Executor::auto_num_threads);
3101 }
int64_t kernel_queue_time_ms_
Definition: Execute.h:1562
void launchKernelsImpl(SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type, const size_t requested_num_threads)
Definition: Execute.cpp:3008
TypeR::rep timer_stop(Type clock_begin)
Definition: measure.h:48
static std::mutex kernel_mutex_
Definition: Execute.h:1641
static const size_t auto_num_threads
Definition: Execute.h:1536
Type timer_start()
Definition: measure.h:42

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Executor::launchKernelsViaResourceMgr ( SharedKernelContext shared_context,
std::vector< std::unique_ptr< ExecutionKernel >> &&  kernels,
const ExecutorDeviceType  device_type,
const std::vector< InputDescriptor > &  input_descs,
const QueryMemoryDescriptor query_mem_desc 
)
private

Launches a vector of kernels for a given query step, gated/scheduled by ExecutorResourceMgr.

This function first calculates the necessary CPU, GPU, result set memory and buffer pool memory for the query, which it then requests from ExecutorResourceMgr. The query thread may be put into a wait state until enough resources are available to execute the query, which may or may not run concurrently with other query steps, depending on the resource grant policies in place and the resources needed by this thread's query step and by all other in-flight queries. Once ExecutorResourceMgr grants the request, the thread calls launchKernelsImpl, which does the actual work of launching the kernels.

Parameters
shared_context - used to obtain the InputTableInfo vector (query_infos) used for input chunk calculation
kernels - vector of kernels that will be launched, one per fragment for CPU execution, but possibly multi-fragment (one per device) for GPU execution
device_type - specifies whether the query step should run on CPU or GPU
input_descs - necessary to get the input table and column ids for a query, for input chunk calculation
query_mem_desc - necessary to get the result set size per kernel

Definition at line 3103 of file Execute.cpp.

References ExecutorResourceMgr_Namespace::CPU_SLOTS, executor_resource_mgr_, QueryMemoryDescriptor::getBufferSizeBytes(), getChunkRequestInfo(), getExecutorId(), SharedKernelContext::getQueryInfos(), GPU, ExecutorResourceMgr_Namespace::GPU_SLOTS, kernel_queue_time_ms_, launchKernelsImpl(), query_mem_desc, QueryMemoryDescriptor::threadsCanReuseGroupByBuffers(), timer_start(), timer_stop(), and VLOG.

Referenced by executeWorkUnitImpl().

3108  {
3109  // CPU queries in general, plus some GPU queries, i.e. certain types of top-k sorts,
3110  // can generate more kernels than cores/GPU devices, so handle this for now
3111  // by capping the number of requested GPU slots at the number of actual GPUs
3112  const size_t num_kernels = kernels.size();
3113  constexpr bool cap_slots = false;
3114  const size_t num_compute_slots =
3115  cap_slots
3116  ? std::min(num_kernels,
3118  ->get_resource_info(
3119  device_type == ExecutorDeviceType::GPU
3122  .second)
3123  : num_kernels;
3124  const size_t cpu_result_mem_bytes_per_kernel =
3125  query_mem_desc.getBufferSizeBytes(device_type);
3126 
3127  std::vector<std::pair<int32_t, FragmentsList>> kernel_fragments_list;
3128  kernel_fragments_list.reserve(num_kernels);
3129  for (auto& kernel : kernels) {
3130  const auto device_id = kernel->get_chosen_device_id();
3131  const auto frag_list = kernel->get_fragment_list();
3132  if (!frag_list.empty()) {
3133  kernel_fragments_list.emplace_back(std::make_pair(device_id, frag_list));
3134  }
3135  }
3136  const auto chunk_request_info = getChunkRequestInfo(
3137  device_type, input_descs, shared_context.getQueryInfos(), kernel_fragments_list);
3138 
3139  auto gen_resource_request_info = [device_type,
3140  num_compute_slots,
3141  cpu_result_mem_bytes_per_kernel,
3142  &chunk_request_info,
3143  &query_mem_desc]() {
3144  if (device_type == ExecutorDeviceType::GPU) {
3146  device_type,
3147  static_cast<size_t>(0), // priority_level
3148  static_cast<size_t>(0), // cpu_slots
3149  static_cast<size_t>(0), // min_cpu_slots,
3150  num_compute_slots, // gpu_slots
3151  num_compute_slots, // min_gpu_slots
3152  cpu_result_mem_bytes_per_kernel * num_compute_slots, // cpu_result_mem,
3153  cpu_result_mem_bytes_per_kernel * num_compute_slots, // min_cpu_result_mem,
3154  chunk_request_info, // chunks needed
3155  false); // output_buffers_reusable_intra_thread
3156  } else {
3157  const size_t min_cpu_slots{1};
3158  const size_t min_cpu_result_mem =
3159  query_mem_desc.threadsCanReuseGroupByBuffers()
3160  ? cpu_result_mem_bytes_per_kernel * min_cpu_slots
3161  : cpu_result_mem_bytes_per_kernel * num_compute_slots;
3163  device_type,
3164  static_cast<size_t>(0), // priority_level
3165  num_compute_slots, // cpu_slots
3166  min_cpu_slots, // min_cpu_slots
3167  size_t(0), // gpu_slots
3168  size_t(0), // min_gpu_slots
3169  cpu_result_mem_bytes_per_kernel * num_compute_slots, // cpu_result_mem
3170  min_cpu_result_mem, // min_cpu_result_mem
3171  chunk_request_info, // chunks needed
3172  query_mem_desc
3173  .threadsCanReuseGroupByBuffers()); // output_buffers_reusable_intra_thread
3174  }
3175  };
3176 
3177  const auto resource_request_info = gen_resource_request_info();
3178 
3179  auto clock_begin = timer_start();
3180  const bool is_empty_request =
3181  resource_request_info.cpu_slots == 0UL && resource_request_info.gpu_slots == 0UL;
3182  auto resource_handle =
3183  is_empty_request ? nullptr
3184  : executor_resource_mgr_->request_resources(resource_request_info);
3185  const auto num_cpu_threads =
3186  is_empty_request ? 0UL : resource_handle->get_resource_grant().cpu_slots;
3187  if (device_type == ExecutorDeviceType::GPU) {
3188  const auto num_gpu_slots =
3189  is_empty_request ? 0UL : resource_handle->get_resource_grant().gpu_slots;
3190  VLOG(1) << "In Executor::LaunchKernels executor " << getExecutorId() << " requested "
3191  << "between " << resource_request_info.min_gpu_slots << " and "
3192  << resource_request_info.gpu_slots << " GPU slots, and was granted "
3193  << num_gpu_slots << " GPU slots.";
3194  } else {
3195  VLOG(1) << "In Executor::LaunchKernels executor " << getExecutorId() << " requested "
3196  << "between " << resource_request_info.min_cpu_slots << " and "
3197  << resource_request_info.cpu_slots << " CPU slots, and was granted "
3198  << num_cpu_threads << " CPU slots.";
3199  }
3200  kernel_queue_time_ms_ += timer_stop(clock_begin);
3201  launchKernelsImpl(shared_context, std::move(kernels), device_type, num_cpu_threads);
3202 }
A container to store requested and minimum necessary resource requests across all resource types cur...
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
int64_t kernel_queue_time_ms_
Definition: Execute.h:1562
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
void launchKernelsImpl(SharedKernelContext &shared_context, std::vector< std::unique_ptr< ExecutionKernel >> &&kernels, const ExecutorDeviceType device_type, const size_t requested_num_threads)
Definition: Execute.cpp:3008
TypeR::rep timer_stop(Type clock_begin)
Definition: measure.h:48
ExecutorResourceMgr_Namespace::ChunkRequestInfo getChunkRequestInfo(const ExecutorDeviceType device_type, const std::vector< InputDescriptor > &input_descs, const std::vector< InputTableInfo > &query_infos, const std::vector< std::pair< int32_t, FragmentsList >> &device_fragment_lists) const
Determines a unique list of chunks and their associated byte sizes for a given query plan...
Definition: Execute.cpp:852
bool threadsCanReuseGroupByBuffers() const
const std::vector< InputTableInfo > & getQueryInfos() const
ExecutorId getExecutorId() const
Definition: Execute.h:1332
#define VLOG(n)
Definition: Logger.h:388
Type timer_start()
Definition: measure.h:42

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
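Stripped of the engine types, the gating described above reduces to: build a request sized from the kernel count and per-kernel result memory, block until the resource manager grants it, then launch with the granted CPU slot count. A hedged sketch of that flow; ResourceRequest, ResourceGrant, and ResourceMgrStub are illustrative stand-ins, not the ExecutorResourceMgr_Namespace API:

#include <cstddef>
#include <iostream>

// Illustrative stand-ins only; the real request/grant types live in ExecutorResourceMgr_Namespace.
struct ResourceRequest {
  size_t cpu_slots;
  size_t min_cpu_slots;
  size_t cpu_result_mem;
};
struct ResourceGrant {
  size_t cpu_slots;
};
struct ResourceMgrStub {
  // The real request_resources() may block the calling thread until the grant is possible.
  ResourceGrant request_resources(const ResourceRequest& request) const {
    return ResourceGrant{request.min_cpu_slots};  // pretend only the minimum was available
  }
};

int main() {
  const size_t num_kernels = 8;
  const size_t result_mem_per_kernel = size_t(1) << 20;  // placeholder: 1 MiB per kernel
  const ResourceRequest request{/*cpu_slots=*/num_kernels,
                                /*min_cpu_slots=*/1,
                                /*cpu_result_mem=*/num_kernels * result_mem_per_kernel};
  const ResourceMgrStub resource_mgr;
  const auto grant = resource_mgr.request_resources(request);  // gate the launch on the grant
  std::cout << "granted " << grant.cpu_slots << " CPU slot(s) for " << num_kernels
            << " kernels\n";
  // A real caller would now invoke launchKernelsImpl(..., grant.cpu_slots).
  return 0;
}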

void Executor::logSystemCPUMemoryStatus ( std::string const &  tag,
size_t const  thread_idx 
) const

Definition at line 740 of file Execute.cpp.

References executor_id_, g_allow_memory_status_log, getDataMgr(), Data_Namespace::DataMgr::getSystemMemoryUsage(), anonymous_namespace{Execute.cpp}::log_system_memory_info_impl(), timer_start(), and timer_stop().

741  {
742  if (g_allow_memory_status_log && getDataMgr()) {
743  auto timer = timer_start();
744  std::ostringstream oss;
745  oss << getDataMgr()->getSystemMemoryUsage();
746  log_system_memory_info_impl(
747  oss.str(), executor_id_, timer_stop(timer), log_tag, thread_idx);
748  }
749 }
void log_system_memory_info_impl(std::string const &mem_log, size_t executor_id, size_t log_time_ms, std::string const &log_tag, size_t const thread_idx)
Definition: Execute.cpp:727
bool g_allow_memory_status_log
Definition: Execute.cpp:123
SystemMemoryUsage getSystemMemoryUsage() const
Definition: DataMgr.cpp:123
TypeR::rep timer_stop(Type clock_begin)
Definition: measure.h:48
const ExecutorId executor_id_
Definition: Execute.h:1476
Data_Namespace::DataMgr * getDataMgr() const
Definition: Execute.h:623
Type timer_start()
Definition: measure.h:42

+ Here is the call graph for this function:

void Executor::logSystemGPUMemoryStatus ( std::string const &  tag,
size_t const  thread_idx 
) const

Definition at line 751 of file Execute.cpp.

References executor_id_, g_allow_memory_status_log, Data_Namespace::DataMgr::getCudaMgr(), getDataMgr(), anonymous_namespace{Execute.cpp}::log_system_memory_info_impl(), timer_start(), and timer_stop().

752  {
753 #ifdef HAVE_CUDA
754  if (g_allow_memory_status_log && getDataMgr() && getDataMgr()->gpusPresent() &&
755  getDataMgr()->getCudaMgr()) {
756  auto timer = timer_start();
757  auto mem_log = getDataMgr()->getCudaMgr()->getCudaMemoryUsageInString();
758  log_system_memory_info_impl(
759  mem_log, executor_id_, timer_stop(timer), log_tag, thread_idx);
760  }
761 #endif
762 }
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:235
void log_system_memory_info_impl(std::string const &mem_log, size_t executor_id, size_t log_time_ms, std::string const &log_tag, size_t const thread_idx)
Definition: Execute.cpp:727
bool g_allow_memory_status_log
Definition: Execute.cpp:123
TypeR::rep timer_stop(Type clock_begin)
Definition: measure.h:48
const ExecutorId executor_id_
Definition: Execute.h:1476
Data_Namespace::DataMgr * getDataMgr() const
Definition: Execute.h:623
Type timer_start()
Definition: measure.h:42

+ Here is the call graph for this function:

size_t Executor::maxGpuSlabSize ( ) const

Definition at line 4358 of file Execute.cpp.

References max_gpu_slab_size_.

4358  {
4359  return max_gpu_slab_size_;
4360 }
const size_t max_gpu_slab_size_
Definition: Execute.h:1554
bool Executor::needFetchAllFragments ( const InputColDescriptor col_desc,
const RelAlgExecutionUnit ra_exe_unit,
const FragmentsList selected_fragments 
) const
private

Definition at line 3384 of file Execute.cpp.

References CHECK_EQ, CHECK_LT, InputDescriptor::getNestLevel(), InputColDescriptor::getScanDesc(), InputDescriptor::getSourceType(), InputDescriptor::getTableKey(), RelAlgExecutionUnit::input_descs, RelAlgExecutionUnit::join_quals, plan_state_, and TABLE.

Referenced by fetchChunks(), and fetchUnionChunks().

3386  {
3387  const auto& input_descs = ra_exe_unit.input_descs;
3388  const int nest_level = inner_col_desc.getScanDesc().getNestLevel();
3389  if (nest_level < 1 ||
3390  inner_col_desc.getScanDesc().getSourceType() != InputSourceType::TABLE ||
3391  ra_exe_unit.join_quals.empty() || input_descs.size() < 2 ||
3392  (ra_exe_unit.join_quals.empty() &&
3393  plan_state_->isLazyFetchColumn(inner_col_desc))) {
3394  return false;
3395  }
3396  const auto& table_key = inner_col_desc.getScanDesc().getTableKey();
3397  CHECK_LT(static_cast<size_t>(nest_level), selected_fragments.size());
3398  CHECK_EQ(table_key, selected_fragments[nest_level].table_key);
3399  const auto& fragments = selected_fragments[nest_level].fragment_ids;
3400  return fragments.size() > 1;
3401 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::vector< InputDescriptor > input_descs
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
#define CHECK_LT(x, y)
Definition: Logger.h:303

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool Executor::needLinearizeAllFragments ( const ColumnDescriptor cd,
const InputColDescriptor inner_col_desc,
const RelAlgExecutionUnit ra_exe_unit,
const FragmentsList selected_fragments,
const Data_Namespace::MemoryLevel  memory_level 
) const
private

Definition at line 3403 of file Execute.cpp.

References CHECK_EQ, CHECK_LT, ColumnDescriptor::columnType, InputDescriptor::getNestLevel(), InputColDescriptor::getScanDesc(), InputDescriptor::getTableKey(), SQLTypeInfo::is_array(), SQLTypeInfo::is_dict_encoded_type(), and SQLTypeInfo::is_string().

Referenced by fetchChunks().

3408  {
3409  const int nest_level = inner_col_desc.getScanDesc().getNestLevel();
3410  const auto& table_key = inner_col_desc.getScanDesc().getTableKey();
3411  CHECK_LT(static_cast<size_t>(nest_level), selected_fragments.size());
3412  CHECK_EQ(table_key, selected_fragments[nest_level].table_key);
3413  const auto& fragments = selected_fragments[nest_level].fragment_ids;
3414  auto need_linearize =
3415  cd->columnType.is_array() ||
3416  (cd->columnType.is_string() && !cd->columnType.is_dict_encoded_type());
3417  return table_key.table_id > 0 && need_linearize && fragments.size() > 1;
3418 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
int32_t getNestLevel() const
bool is_dict_encoded_type() const
Definition: sqltypes.h:653
const shared::TableKey & getTableKey() const
#define CHECK_LT(x, y)
Definition: Logger.h:303
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:559
const InputDescriptor & getScanDesc() const
bool is_array() const
Definition: sqltypes.h:583

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

static void Executor::nukeCacheOfExecutors ( )
inlinestatic

Definition at line 505 of file Execute.h.

References execute_mutex_, executors_, and executors_cache_mutex_.

505  {
506  heavyai::unique_lock<heavyai::shared_mutex> flush_lock(
507  execute_mutex_); // don't want native code to vanish while executing
508  heavyai::unique_lock<heavyai::shared_mutex> lock(executors_cache_mutex_);
509  executors_.clear();
510  }
static heavyai::shared_mutex execute_mutex_
Definition: Execute.h:1585
std::unique_lock< T > unique_lock
static std::map< int, std::shared_ptr< Executor > > executors_
Definition: Execute.h:1581
static heavyai::shared_mutex executors_cache_mutex_
Definition: Execute.h:1602
void Executor::nukeOldState ( const bool  allow_lazy_fetch,
const std::vector< InputTableInfo > &  query_infos,
const PlanState::DeletedColumnsMap deleted_cols_map,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 4234 of file Execute.cpp.

References cgen_state_, compilation_queue_time_ms_, RelAlgExecutionUnit::join_quals, kernel_queue_time_ms_, LEFT, and plan_state_.

4237  {
4238  kernel_queue_time_ms_ = 0;
4239  compilation_queue_time_ms_ = 0;
4240  const bool contains_left_deep_outer_join =
4241  ra_exe_unit && std::find_if(ra_exe_unit->join_quals.begin(),
4242  ra_exe_unit->join_quals.end(),
4243  [](const JoinCondition& join_condition) {
4244  return join_condition.type == JoinType::LEFT;
4245  }) != ra_exe_unit->join_quals.end();
4246  cgen_state_.reset(
4247  new CgenState(query_infos.size(), contains_left_deep_outer_join, this));
4248  plan_state_.reset(new PlanState(allow_lazy_fetch && !contains_left_deep_outer_join,
4249  query_infos,
4250  deleted_cols_map,
4251  this));
4252 }
int64_t kernel_queue_time_ms_
Definition: Execute.h:1562
int64_t compilation_queue_time_ms_
Definition: Execute.h:1563
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
unsigned Executor::numBlocksPerMP ( ) const

Definition at line 4327 of file Execute.cpp.

References shared::ceil_div(), cudaMgr(), and grid_size_x_.

4327  {
4328  return std::max((unsigned)2,
4329  shared::ceil_div(grid_size_x_, cudaMgr()->getMinNumMPsForAllDevices()));
4330 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:865
unsigned ceil_div(unsigned const dividend, unsigned const divisor)
Definition: misc.h:329
unsigned grid_size_x_
Definition: Execute.h:1553

+ Here is the call graph for this function:

std::shared_ptr< CompilationContext > Executor::optimizeAndCodegenCPU ( llvm::Function *  query_func,
llvm::Function *  multifrag_query_func,
const std::unordered_set< llvm::Function * > &  live_funcs,
const CompilationOptions co 
)
private

Definition at line 485 of file NativeCodegen.cpp.

References QueryEngine::getInstance(), logger::INFO, CodeGenerator::link_udf_module(), LOG, serialize_llvm_object(), and to_string().

489  {
490  CodeCacheKey key{serialize_llvm_object(query_func),
491  serialize_llvm_object(cgen_state_->row_func_)};
492 
493  llvm::Module* M = query_func->getParent();
494  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
495  M->getModuleFlag("manage_memory_buffer"));
496  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
497  M->getFunction("register_buffer_with_executor_rsm")) {
498  LOG(INFO) << "including executor addr to cache key\n";
499  key.push_back(std::to_string(reinterpret_cast<int64_t>(this)));
500  }
501  if (cgen_state_->filter_func_) {
502  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
503  }
504  for (const auto helper : cgen_state_->helper_functions_) {
505  key.push_back(serialize_llvm_object(helper));
506  }
507  auto cached_code = QueryEngine::getInstance()->cpu_code_accessor->get_value(key);
508  if (cached_code) {
509  return cached_code;
510  }
511 
512  if (cgen_state_->needs_geos_) {
513 #ifdef ENABLE_GEOS
514  auto llvm_module = multifrag_query_func->getParent();
515  load_geos_dynamic_library();
516 
517  // Read geos runtime module and bind GEOS API function references to GEOS library
518  auto rt_geos_module_copy = llvm::CloneModule(
519  *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
520  auto func = llvm::dyn_cast<llvm::Function>(gv);
521  if (!func) {
522  return true;
523  }
524  switch (func->getLinkage()) {
525  case llvm::GlobalValue::LinkageTypes::InternalLinkage:
526  case llvm::GlobalValue::LinkageTypes::PrivateLinkage:
527  case llvm::GlobalValue::LinkageTypes::ExternalLinkage:
528  case llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage:
529  return true;
530  default:
531  return false;
532  }
533  });
534  CodeGenerator::link_udf_module(rt_geos_module_copy,
535  *llvm_module,
536  cgen_state_.get(),
537  llvm::Linker::Flags::LinkOnlyNeeded);
538 #else
539  throw std::runtime_error("GEOS is disabled in this build");
540 #endif
541  }
542 
543  auto execution_engine =
544  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
545  auto cpu_compilation_context =
546  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
547  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
548  QueryEngine::getInstance()->cpu_code_accessor->put(key, cpu_compilation_context);
549  return std::dynamic_pointer_cast<CompilationContext>(cpu_compilation_context);
550 }
const std::unique_ptr< llvm::Module > & get_geos_module() const
Definition: Execute.h:545
#define LOG(tag)
Definition: Logger.h:285
std::vector< std::string > CodeCacheKey
Definition: CodeCache.h:24
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
static ExecutionEngineWrapper generateNativeCPUCode(llvm::Function *func, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co)
std::string to_string(char const *&&v)
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
std::string serialize_llvm_object(const T *llvm_obj)
static std::shared_ptr< QueryEngine > getInstance()
Definition: QueryEngine.h:89

+ Here is the call graph for this function:
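
The CPU path above builds a CodeCacheKey from the serialized query, row, filter, and helper functions, probes the cache, and only falls back to generateNativeCPUCode on a miss. As a rough, self-contained sketch of that lookup-then-compile shape (the Key, Compiled, and getOrCompile names below are hypothetical and not part of the codebase):

#include <map>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-ins for CodeCacheKey and the cached compilation artifact.
using Key = std::vector<std::string>;
struct Compiled {
  std::string blob;
};

// Probe the cache first; compile and insert only on a miss, mirroring the
// get_value / put sequence in optimizeAndCodegenCPU above.
std::shared_ptr<Compiled> getOrCompile(std::map<Key, std::shared_ptr<Compiled>>& cache,
                                       const Key& key,
                                       const std::string& serialized_ir) {
  if (auto it = cache.find(key); it != cache.end()) {
    return it->second;  // cache hit: reuse the previously generated native code
  }
  auto compiled = std::make_shared<Compiled>(Compiled{"native code for " + serialized_ir});
  cache.emplace(key, compiled);
  return compiled;
}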

std::shared_ptr< CompilationContext > Executor::optimizeAndCodegenGPU ( llvm::Function *  query_func,
llvm::Function *  multifrag_query_func,
std::unordered_set< llvm::Function * > &  live_funcs,
const bool  no_inline,
const CudaMgr_Namespace::CudaMgr cuda_mgr,
const bool  is_gpu_smem_used,
const CompilationOptions co 
)
private

Definition at line 1399 of file NativeCodegen.cpp.

1406  {
1407 #ifdef HAVE_CUDA
1408  auto timer = DEBUG_TIMER(__func__);
1409 
1410  CHECK(cuda_mgr);
1411  CodeCacheKey key{serialize_llvm_object(query_func),
1412  serialize_llvm_object(cgen_state_->row_func_)};
1413  if (cgen_state_->filter_func_) {
1414  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
1415  }
1416  for (const auto helper : cgen_state_->helper_functions_) {
1417  key.push_back(serialize_llvm_object(helper));
1418  }
1419  auto cached_code = QueryEngine::getInstance()->gpu_code_accessor->get_value(key);
1420  if (cached_code) {
1421  return cached_code;
1422  }
1423 
1424  bool row_func_not_inlined = false;
1425  if (no_inline) {
1426  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
1427  e = llvm::inst_end(cgen_state_->row_func_);
1428  it != e;
1429  ++it) {
1430  if (llvm::isa<llvm::CallInst>(*it)) {
1431  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1432  auto const func_name = CodegenUtil::getCalledFunctionName(get_gv_call);
1433  if (func_name &&
1434  (*func_name == "array_size" || *func_name == "linear_probabilistic_count")) {
1435  mark_function_never_inline(cgen_state_->row_func_);
1436  row_func_not_inlined = true;
1437  break;
1438  }
1439  }
1440  }
1441  }
1442 
1443  initializeNVPTXBackend();
1444  CodeGenerator::GPUTarget gpu_target{
1445  nvptx_target_machine_.get(), cuda_mgr, cgen_state_.get(), row_func_not_inlined};
1446  std::shared_ptr<GpuCompilationContext> compilation_context;
1447 
1448  try {
1449  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1450  query_func,
1451  multifrag_query_func,
1452  live_funcs,
1453  is_gpu_smem_used,
1454  co,
1455  gpu_target);
1456  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1457  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1458  // Thrown if memory not able to be allocated on gpu
1459  // Retry once after evicting portion of code cache
1460  auto& code_cache_accessor = QueryEngine::getInstance()->gpu_code_accessor;
1461  auto const num_entries_to_evict =
1462  code_cache_accessor->computeNumEntriesToEvict(g_fraction_code_cache_to_evict);
1463  code_cache_accessor->evictEntries(num_entries_to_evict);
1464  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1465  query_func,
1466  multifrag_query_func,
1467  live_funcs,
1468  is_gpu_smem_used,
1469  co,
1470  gpu_target);
1471  } else {
1472  throw;
1473  }
1474  }
1475  QueryEngine::getInstance()->gpu_code_accessor->put(key, compilation_context);
1476  return std::dynamic_pointer_cast<CompilationContext>(compilation_context);
1477 #else
1478  return nullptr;
1479 #endif
1480 }
std::optional< std::string_view > getCalledFunctionName(llvm::CallInst &call_inst)
std::unique_ptr< llvm::TargetMachine > nvptx_target_machine_
Definition: Execute.h:1547
void mark_function_never_inline(llvm::Function *func)
std::vector< std::string > CodeCacheKey
Definition: CodeCache.h:24
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
void initializeNVPTXBackend() const
std::string serialize_llvm_object(const T *llvm_obj)
static std::shared_ptr< GpuCompilationContext > generateNativeGPUCode(Executor *executor, llvm::Function *func, llvm::Function *wrapper_func, const std::unordered_set< llvm::Function * > &live_funcs, const bool is_gpu_smem_used, const CompilationOptions &co, const GPUTarget &gpu_target)
float g_fraction_code_cache_to_evict
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
static std::shared_ptr< QueryEngine > getInstance()
Definition: QueryEngine.h:89
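
The GPU path retries compilation once after evicting part of the code cache when the CUDA driver reports an out-of-memory error. A minimal stand-alone sketch of that pattern, with a hypothetical GpuOutOfMemory exception standing in for CudaMgr_Namespace::CudaErrorException:

#include <stdexcept>

// Hypothetical stand-in for a CUDA_ERROR_OUT_OF_MEMORY failure.
struct GpuOutOfMemory : std::runtime_error {
  using std::runtime_error::runtime_error;
};

// Attempt GPU code generation; on an out-of-memory error, evict part of the
// code cache once and retry. A second failure propagates to the caller.
template <typename BuildFn, typename EvictFn>
auto buildWithEvictionRetry(BuildFn build, EvictFn evict_some_cache) {
  try {
    return build();
  } catch (const GpuOutOfMemory&) {
    evict_some_cache();  // free device memory held by cached kernels
    return build();
  }
}
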
void Executor::pause_executor_queue ( )
static

Definition at line 5386 of file Execute.cpp.

References executor_resource_mgr_, and g_enable_executor_resource_mgr.

Referenced by anonymous_namespace{DBHandler.cpp}::pause_and_resume_executor_queue(), and DBHandler::pause_executor_queue().

5386  {
5387  if (!g_enable_executor_resource_mgr) {
5388  throw std::runtime_error(
5389  "Executor queue cannot be paused as it requires Executor Resource Manager to be "
5390  "enabled");
5391  }
5392  executor_resource_mgr_->pause_process_queue();
5393 }
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174

+ Here is the caller graph for this function:
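
A hedged usage sketch of the pause/resume pair (both are static members shown in this reference; the drain_and_service wrapper is hypothetical). Both calls throw unless the Executor Resource Manager is enabled:

#include "Execute.h"

// Hypothetical call site: pause the queue, perform maintenance, then resume.
void drain_and_service() {
  Executor::pause_executor_queue();   // stop admitting new kernels
  // ... maintenance work while the queue is paused ...
  Executor::resume_executor_queue();  // let queued requests proceed again
}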

void Executor::preloadFragOffsets ( const std::vector< InputDescriptor > &  input_descs,
const std::vector< InputTableInfo > &  query_infos 
)
private

Definition at line 4254 of file Execute.cpp.

References AUTOMATIC_IR_METADATA, cgen_state_, CHECK_LT, and get_arg_by_name().

4255  {
4256  AUTOMATIC_IR_METADATA(cgen_state_.get());
4257  const auto ld_count = input_descs.size();
4258  auto frag_off_ptr = get_arg_by_name(cgen_state_->row_func_, "frag_row_off");
4259  for (size_t i = 0; i < ld_count; ++i) {
4260  CHECK_LT(i, query_infos.size());
4261  const auto frag_count = query_infos[i].info.fragments.size();
4262  if (i > 0) {
4263  cgen_state_->frag_offsets_.push_back(nullptr);
4264  } else {
4265  if (frag_count > 1) {
4266  cgen_state_->frag_offsets_.push_back(cgen_state_->ir_builder_.CreateLoad(
4267  frag_off_ptr->getType()->getPointerElementType(), frag_off_ptr));
4268  } else {
4269  cgen_state_->frag_offsets_.push_back(nullptr);
4270  }
4271  }
4272  }
4273 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:168
#define AUTOMATIC_IR_METADATA(CGENSTATE)
#define CHECK_LT(x, y)
Definition: Logger.h:303

+ Here is the call graph for this function:

std::vector< llvm::Value * > Executor::prepareRangeModeFuncArgs ( bool  for_start_bound,
const Analyzer::WindowFrame frame_bound,
bool  is_timestamp_type_frame,
llvm::Value *  order_key_null_val,
const WindowFrameBoundFuncArgs frame_args 
) const
private

Definition at line 865 of file WindowFunctionIR.cpp.

References WindowFrameBoundFuncArgs::current_col_value_lv, WindowFrameBoundFuncArgs::frame_end_bound_expr_lv, WindowFrameBoundFuncArgs::frame_start_bound_expr_lv, WindowFrameBoundFuncArgs::int64_t_zero_val_lv, Analyzer::WindowFrame::isCurrentRowBound(), WindowFrameBoundFuncArgs::null_end_pos_lv, WindowFrameBoundFuncArgs::null_start_pos_lv, WindowFrameBoundFuncArgs::nulls_first_lv, WindowFrameBoundFuncArgs::num_elem_current_partition_lv, WindowFrameBoundFuncArgs::order_key_buf_ptr_lv, WindowFrameBoundFuncArgs::target_partition_rowid_ptr_lv, and WindowFrameBoundFuncArgs::target_partition_sorted_rowid_ptr_lv.

870  {
871  llvm::Value* bound_expr_lv =
872  for_start_bound ? args.frame_start_bound_expr_lv : args.frame_end_bound_expr_lv;
873  llvm::Value* target_val_lv =
874  frame_bound->isCurrentRowBound() || !is_timestamp_type_frame
875  ? args.current_col_value_lv
876  : bound_expr_lv;
877  llvm::Value* frame_bound_val_lv =
878  frame_bound->isCurrentRowBound() || is_timestamp_type_frame
879  ? args.int64_t_zero_val_lv
880  : bound_expr_lv;
881  std::vector<llvm::Value*> frame_args{args.num_elem_current_partition_lv,
882  target_val_lv,
883  args.order_key_buf_ptr_lv,
884  args.target_partition_rowid_ptr_lv,
885  args.target_partition_sorted_rowid_ptr_lv,
886  frame_bound_val_lv,
887  order_key_null_val,
888  args.nulls_first_lv,
889  args.null_start_pos_lv,
890  args.null_end_pos_lv};
891  return frame_args;
892 }
bool isCurrentRowBound() const
Definition: Analyzer.h:2710

+ Here is the call graph for this function:

std::vector< llvm::Value * > Executor::prepareRowModeFuncArgs ( bool  for_start_bound,
SqlWindowFrameBoundType  bound_type,
const WindowFrameBoundFuncArgs args 
) const
private

Definition at line 847 of file WindowFunctionIR.cpp.

References WindowFrameBoundFuncArgs::current_partition_start_offset_lv, CURRENT_ROW, WindowFrameBoundFuncArgs::current_row_pos_lv, EXPR_FOLLOWING, WindowFrameBoundFuncArgs::frame_end_bound_expr_lv, WindowFrameBoundFuncArgs::frame_start_bound_expr_lv, WindowFrameBoundFuncArgs::int64_t_zero_val_lv, and WindowFrameBoundFuncArgs::num_elem_current_partition_lv.

850  {
851  std::vector<llvm::Value*> frame_args{args.current_row_pos_lv,
852  args.current_partition_start_offset_lv};
853  if (bound_type == SqlWindowFrameBoundType::CURRENT_ROW) {
854  frame_args.push_back(args.int64_t_zero_val_lv);
855  } else {
856  frame_args.push_back(for_start_bound ? args.frame_start_bound_expr_lv
857  : args.frame_end_bound_expr_lv);
858  if (bound_type == SqlWindowFrameBoundType::EXPR_FOLLOWING) {
859  frame_args.push_back(args.num_elem_current_partition_lv);
860  }
861  }
862  return frame_args;
863 }
llvm::Value * num_elem_current_partition_lv
Definition: WindowContext.h:95
llvm::Value * current_row_pos_lv
Definition: WindowContext.h:90
llvm::Value * frame_end_bound_expr_lv
Definition: WindowContext.h:89
llvm::Value * current_partition_start_offset_lv
Definition: WindowContext.h:92
llvm::Value * int64_t_zero_val_lv
Definition: WindowContext.h:93
llvm::Value * frame_start_bound_expr_lv
Definition: WindowContext.h:88
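
As an illustration of the ROWS-mode branching above, the following stand-alone sketch mirrors which arguments get pushed per bound type, using plain ints in place of llvm::Value* (all names here are illustrative only):

#include <vector>

// CURRENT ROW pushes a literal zero offset, EXPR PRECEDING/FOLLOWING push the
// bound expression, and EXPR FOLLOWING additionally pushes the partition size.
enum class Bound { kCurrentRow, kExprPreceding, kExprFollowing };

std::vector<int> rowModeArgs(Bound bound,
                             int row_pos,
                             int partition_start_offset,
                             int bound_expr,
                             int partition_size) {
  std::vector<int> args{row_pos, partition_start_offset};
  if (bound == Bound::kCurrentRow) {
    args.push_back(0);
  } else {
    args.push_back(bound_expr);
    if (bound == Bound::kExprFollowing) {
      args.push_back(partition_size);
    }
  }
  return args;
}
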
void Executor::redeclareFilterFunction ( )
private

Definition at line 1086 of file IRCodegen.cpp.

References CHECK, CHECK_EQ, get_int_type(), and to_string().

1086  {
1087  if (!cgen_state_->filter_func_) {
1088  return;
1089  }
1090 
1091  // Loop over all the instructions used in the filter func.
1092  // The filter func instructions were generated as if for row func.
1093  // Remap any values used by those instructions to filter func args
1094  // and remember to forward them through the call in the row func.
1095  for (auto bb_it = cgen_state_->filter_func_->begin();
1096  bb_it != cgen_state_->filter_func_->end();
1097  ++bb_it) {
1098  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
1099  size_t i = 0;
1100  for (auto op_it = instr_it->value_op_begin(); op_it != instr_it->value_op_end();
1101  ++op_it, ++i) {
1102  llvm::Value* v = *op_it;
1103 
1104  // The last LLVM operand on a call instruction is the function to be called. Never
1105  // remap it.
1106  if (llvm::dyn_cast<const llvm::CallInst>(instr_it) &&
1107  op_it == instr_it->value_op_end() - 1) {
1108  continue;
1109  }
1110 
1111  CHECK(v);
1112  if (auto* instr = llvm::dyn_cast<llvm::Instruction>(v);
1113  instr && instr->getParent() &&
1114  instr->getParent()->getParent() == cgen_state_->row_func_) {
1115  // Remember that this filter func arg is needed.
1116  cgen_state_->filter_func_args_[v] = nullptr;
1117  } else if (auto* argum = llvm::dyn_cast<llvm::Argument>(v);
1118  argum && argum->getParent() == cgen_state_->row_func_) {
1119  // Remember that this filter func arg is needed.
1120  cgen_state_->filter_func_args_[v] = nullptr;
1121  }
1122  }
1123  }
1124  }
1125 
1126  // Create filter_func2 with parameters only for those row func values that are known to
1127  // be used in the filter func code.
1128  std::vector<llvm::Type*> filter_func_arg_types;
1129  filter_func_arg_types.reserve(cgen_state_->filter_func_args_.v_.size());
1130  for (auto& arg : cgen_state_->filter_func_args_.v_) {
1131  filter_func_arg_types.push_back(arg->getType());
1132  }
1133  auto ft = llvm::FunctionType::get(
1134  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
1135  cgen_state_->filter_func_->setName("old_filter_func");
1136  auto filter_func2 = llvm::Function::Create(ft,
1137  llvm::Function::ExternalLinkage,
1138  "filter_func",
1139  cgen_state_->filter_func_->getParent());
1140  CHECK_EQ(filter_func2->arg_size(), cgen_state_->filter_func_args_.v_.size());
1141  auto arg_it = cgen_state_->filter_func_args_.begin();
1142  size_t i = 0;
1143  for (llvm::Function::arg_iterator I = filter_func2->arg_begin(),
1144  E = filter_func2->arg_end();
1145  I != E;
1146  ++I, ++arg_it) {
1147  arg_it->second = &*I;
1148  if (arg_it->first->hasName()) {
1149  I->setName(arg_it->first->getName());
1150  } else {
1151  I->setName("extra" + std::to_string(i++));
1152  }
1153  }
1154 
1155  // copy the filter_func function body over
1156  // see
1157  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1158  filter_func2->getBasicBlockList().splice(
1159  filter_func2->begin(), cgen_state_->filter_func_->getBasicBlockList());
1160 
1161  if (cgen_state_->current_func_ == cgen_state_->filter_func_) {
1162  cgen_state_->current_func_ = filter_func2;
1163  }
1164  cgen_state_->filter_func_ = filter_func2;
1165 
1166  // loop over all the operands in the filter func
1167  for (auto bb_it = cgen_state_->filter_func_->begin();
1168  bb_it != cgen_state_->filter_func_->end();
1169  ++bb_it) {
1170  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
1171  size_t i = 0;
1172  for (auto op_it = instr_it->op_begin(); op_it != instr_it->op_end(); ++op_it, ++i) {
1173  llvm::Value* v = op_it->get();
1174  if (auto arg_it = cgen_state_->filter_func_args_.find(v);
1175  arg_it != cgen_state_->filter_func_args_.end()) {
1176  // replace row func value with a filter func arg
1177  llvm::Use* use = &*op_it;
1178  use->set(arg_it->second);
1179  }
1180  }
1181  }
1182  }
1183 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:
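
The core trick above is moving (not cloning) the old function's body into a freshly declared function with an explicit argument list, then remapping operands to the new arguments. A minimal sketch of the splice step, assuming an LLVM version that still exposes getBasicBlockList() as the code shown does (function and parameter names are illustrative):

#include <llvm/IR/Function.h>
#include <llvm/IR/Module.h>

llvm::Function* redeclare_with_new_signature(llvm::Function* old_fn,
                                             llvm::FunctionType* new_ty,
                                             llvm::Module& module) {
  old_fn->setName("old_filter_func");
  auto* new_fn = llvm::Function::Create(
      new_ty, llvm::Function::ExternalLinkage, "filter_func", &module);
  // Move the basic blocks over; the old function is left empty.
  new_fn->getBasicBlockList().splice(new_fn->begin(), old_fn->getBasicBlockList());
  return new_fn;
}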

ResultSetPtr Executor::reduceMultiDeviceResults ( const RelAlgExecutionUnit ra_exe_unit,
std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &  all_fragment_results,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const QueryMemoryDescriptor query_mem_desc 
) const
private

Definition at line 1564 of file Execute.cpp.

References blockSize(), CPU, DEBUG_TIMER, RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), getUniqueThreadSharedResultSets(), gridSize(), QueryMemoryDescriptor, reduce_estimator_results(), reduceMultiDeviceResultSets(), RelAlgExecutionUnit::target_exprs, and QueryMemoryDescriptor::threadsCanReuseGroupByBuffers().

Referenced by collectAllDeviceResults().

1568  {
1569  auto timer = DEBUG_TIMER(__func__);
1570  if (ra_exe_unit.estimator) {
1571  return reduce_estimator_results(ra_exe_unit, results_per_device);
1572  }
1573 
1574  if (results_per_device.empty()) {
1575  auto const targets = shared::transform<std::vector<TargetInfo>>(
1576  ra_exe_unit.target_exprs, GetTargetInfo{});
1577  return std::make_shared<ResultSet>(targets,
1578  ExecutorDeviceType::CPU,
1579  QueryMemoryDescriptor(),
1580  nullptr,
1581  blockSize(),
1582  gridSize());
1583  }
1584 
1585  if (query_mem_desc.threadsCanReuseGroupByBuffers()) {
1586  auto unique_results = getUniqueThreadSharedResultSets(results_per_device);
1587  return reduceMultiDeviceResultSets(
1588  unique_results,
1589  row_set_mem_owner,
1590  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc));
1591  }
1592  return reduceMultiDeviceResultSets(
1593  results_per_device,
1594  row_set_mem_owner,
1595  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc));
1596 }
std::vector< Analyzer::Expr * > target_exprs
bool threadsCanReuseGroupByBuffers() const
friend class QueryMemoryDescriptor
Definition: Execute.h:1658
const std::shared_ptr< Analyzer::Estimator > estimator
unsigned gridSize() const
Definition: Execute.cpp:4318
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:766
ResultSetPtr reduceMultiDeviceResultSets(std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &all_fragment_results, std::shared_ptr< RowSetMemoryOwner >, const QueryMemoryDescriptor &) const
Definition: Execute.cpp:1639
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > getUniqueThreadSharedResultSets(const std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &results_per_device) const
Definition: Execute.cpp:1599
#define DEBUG_TIMER(name)
Definition: Logger.h:412
ResultSetPtr reduce_estimator_results(const RelAlgExecutionUnit &ra_exe_unit, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &results_per_device)
unsigned blockSize() const
Definition: Execute.cpp:4332

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

ResultSetPtr Executor::reduceMultiDeviceResultSets ( std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &  all_fragment_results,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const QueryMemoryDescriptor query_mem_desc 
) const
private

Definition at line 1639 of file Execute.cpp.

References gpu_enabled::accumulate(), blockSize(), CHECK, CPU, DEBUG_TIMER, executor_id_, anonymous_namespace{Execute.cpp}::get_reduction_code(), QueryMemoryDescriptor::getQueryDescriptionType(), gridSize(), GroupByBaselineHash, logger::init(), plan_state_, query_mem_desc, and QueryMemoryDescriptor::setEntryCount().

Referenced by reduceMultiDeviceResults().

1642  {
1643  auto timer = DEBUG_TIMER(__func__);
1644  std::shared_ptr<ResultSet> reduced_results;
1645 
1646  const auto& first = results_per_device.front().first;
1647 
1648  if (query_mem_desc.getQueryDescriptionType() ==
1649  QueryDescriptionType::GroupByBaselineHash &&
1650  results_per_device.size() > 1) {
1651  const auto total_entry_count = std::accumulate(
1652  results_per_device.begin(),
1653  results_per_device.end(),
1654  size_t(0),
1655  [](const size_t init, const std::pair<ResultSetPtr, std::vector<size_t>>& rs) {
1656  const auto& r = rs.first;
1657  return init + r->getQueryMemDesc().getEntryCount();
1658  });
1659  CHECK(total_entry_count);
1660  auto query_mem_desc = first->getQueryMemDesc();
1661  query_mem_desc.setEntryCount(total_entry_count);
1662  reduced_results = std::make_shared<ResultSet>(first->getTargetInfos(),
1663  ExecutorDeviceType::CPU,
1664  query_mem_desc,
1665  row_set_mem_owner,
1666  blockSize(),
1667  gridSize());
1668  auto result_storage = reduced_results->allocateStorage(plan_state_->init_agg_vals_);
1669  reduced_results->initializeStorage();
1670  switch (query_mem_desc.getEffectiveKeyWidth()) {
1671  case 4:
1672  first->getStorage()->moveEntriesToBuffer<int32_t>(
1673  result_storage->getUnderlyingBuffer(), query_mem_desc.getEntryCount());
1674  break;
1675  case 8:
1676  first->getStorage()->moveEntriesToBuffer<int64_t>(
1677  result_storage->getUnderlyingBuffer(), query_mem_desc.getEntryCount());
1678  break;
1679  default:
1680  CHECK(false);
1681  }
1682  } else {
1683  reduced_results = first;
1684  }
1685 
1686  int64_t compilation_queue_time = 0;
1687  const auto reduction_code =
1688  get_reduction_code(executor_id_, results_per_device, &compilation_queue_time);
1689 
1690  for (size_t i = 1; i < results_per_device.size(); ++i) {
1691  reduced_results->getStorage()->reduce(
1692  *(results_per_device[i].first->getStorage()), {}, reduction_code, executor_id_);
1693  }
1694  reduced_results->addCompilationQueueTime(compilation_queue_time);
1695  reduced_results->invalidateCachedRowCount();
1696  return reduced_results;
1697 }
void setEntryCount(const size_t val)
const ExecutorId executor_id_
Definition: Execute.h:1476
void init(LogOptions const &log_opts)
Definition: Logger.cpp:364
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
DEVICE auto accumulate(ARGS &&...args)
Definition: gpu_enabled.h:42
QueryDescriptionType getQueryDescriptionType() const
ReductionCode get_reduction_code(const size_t executor_id, std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &results_per_device, int64_t *compilation_queue_time)
Definition: Execute.cpp:1622
unsigned gridSize() const
Definition: Execute.cpp:4318
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
unsigned blockSize() const
Definition: Execute.cpp:4332

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
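
Before reduction, the merged buffer is sized by summing the entry counts reported by every per-device result set. A trivial stand-alone illustration of that accumulation (the function name is hypothetical):

#include <cstddef>
#include <numeric>
#include <vector>

// Sum the entry counts of the per-device result sets to size the merged buffer.
std::size_t totalEntryCount(const std::vector<std::size_t>& per_device_entry_counts) {
  return std::accumulate(
      per_device_entry_counts.begin(), per_device_entry_counts.end(), std::size_t(0));
}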

std::pair< int64_t, int32_t > Executor::reduceResults ( const SQLAgg  agg,
const SQLTypeInfo ti,
const int64_t  agg_init_val,
const int8_t  out_byte_width,
const int64_t *  out_vec,
const size_t  out_vec_sz,
const bool  is_group_by,
const bool  float_argument_input 
)
static

Definition at line 1312 of file Execute.cpp.

References agg_max_double_skip_val(), agg_max_float_skip_val(), agg_max_skip_val(), agg_min_double_skip_val(), agg_min_float_skip_val(), agg_min_skip_val(), agg_sum_double_skip_val(), agg_sum_float_skip_val(), agg_sum_skip_val(), CHECK, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, float_to_double_bin(), SQLTypeInfo::get_notnull(), SQLTypeInfo::is_boolean(), SQLTypeInfo::is_decimal(), SQLTypeInfo::is_fp(), SQLTypeInfo::is_integer(), SQLTypeInfo::is_time(), kAVG, kCOUNT, kCOUNT_IF, kMAX, kMIN, kSAMPLE, kSINGLE_VALUE, kSUM, kSUM_IF, and UNREACHABLE.

Referenced by executePlanWithoutGroupBy().

1319  {
1320  switch (agg) {
1321  case kAVG:
1322  case kSUM:
1323  case kSUM_IF:
1324  if (0 != agg_init_val) {
1325  if (ti.is_integer() || ti.is_decimal() || ti.is_time() || ti.is_boolean()) {
1326  int64_t agg_result = agg_init_val;
1327  for (size_t i = 0; i < out_vec_sz; ++i) {
1328  agg_sum_skip_val(&agg_result, out_vec[i], agg_init_val);
1329  }
1330  return {agg_result, 0};
1331  } else {
1332  CHECK(ti.is_fp());
1333  switch (out_byte_width) {
1334  case 4: {
1335  int agg_result = static_cast<int32_t>(agg_init_val);
1336  for (size_t i = 0; i < out_vec_sz; ++i) {
1337  agg_sum_float_skip_val(
1338  &agg_result,
1339  *reinterpret_cast<const float*>(may_alias_ptr(&out_vec[i])),
1340  *reinterpret_cast<const float*>(may_alias_ptr(&agg_init_val)));
1341  }
1342  const int64_t converted_bin =
1343  float_argument_input
1344  ? static_cast<int64_t>(agg_result)
1345  : float_to_double_bin(static_cast<int32_t>(agg_result), true);
1346  return {converted_bin, 0};
1347  break;
1348  }
1349  case 8: {
1350  int64_t agg_result = agg_init_val;
1351  for (size_t i = 0; i < out_vec_sz; ++i) {
1352  agg_sum_double_skip_val(
1353  &agg_result,
1354  *reinterpret_cast<const double*>(may_alias_ptr(&out_vec[i])),
1355  *reinterpret_cast<const double*>(may_alias_ptr(&agg_init_val)));
1356  }
1357  return {agg_result, 0};
1358  break;
1359  }
1360  default:
1361  CHECK(false);
1362  }
1363  }
1364  }
1365  if (ti.is_integer() || ti.is_decimal() || ti.is_time()) {
1366  int64_t agg_result = 0;
1367  for (size_t i = 0; i < out_vec_sz; ++i) {
1368  agg_result += out_vec[i];
1369  }
1370  return {agg_result, 0};
1371  } else {
1372  CHECK(ti.is_fp());
1373  switch (out_byte_width) {
1374  case 4: {
1375  float r = 0.;
1376  for (size_t i = 0; i < out_vec_sz; ++i) {
1377  r += *reinterpret_cast<const float*>(may_alias_ptr(&out_vec[i]));
1378  }
1379  const auto float_bin = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
1380  const int64_t converted_bin =
1381  float_argument_input ? float_bin : float_to_double_bin(float_bin, true);
1382  return {converted_bin, 0};
1383  }
1384  case 8: {
1385  double r = 0.;
1386  for (size_t i = 0; i < out_vec_sz; ++i) {
1387  r += *reinterpret_cast<const double*>(may_alias_ptr(&out_vec[i]));
1388  }
1389  return {*reinterpret_cast<const int64_t*>(may_alias_ptr(&r)), 0};
1390  }
1391  default:
1392  CHECK(false);
1393  }
1394  }
1395  break;
1396  case kCOUNT:
1397  case kCOUNT_IF: {
1398  uint64_t agg_result = 0;
1399  for (size_t i = 0; i < out_vec_sz; ++i) {
1400  const uint64_t out = static_cast<uint64_t>(out_vec[i]);
1401  agg_result += out;
1402  }
1403  return {static_cast<int64_t>(agg_result), 0};
1404  }
1405  case kMIN: {
1406  if (ti.is_integer() || ti.is_decimal() || ti.is_time() || ti.is_boolean()) {
1407  int64_t agg_result = agg_init_val;
1408  for (size_t i = 0; i < out_vec_sz; ++i) {
1409  agg_min_skip_val(&agg_result, out_vec[i], agg_init_val);
1410  }
1411  return {agg_result, 0};
1412  } else {
1413  switch (out_byte_width) {
1414  case 4: {
1415  int32_t agg_result = static_cast<int32_t>(agg_init_val);
1416  for (size_t i = 0; i < out_vec_sz; ++i) {
1417  agg_min_float_skip_val(
1418  &agg_result,
1419  *reinterpret_cast<const float*>(may_alias_ptr(&out_vec[i])),
1420  *reinterpret_cast<const float*>(may_alias_ptr(&agg_init_val)));
1421  }
1422  const int64_t converted_bin =
1423  float_argument_input
1424  ? static_cast<int64_t>(agg_result)
1425  : float_to_double_bin(static_cast<int32_t>(agg_result), true);
1426  return {converted_bin, 0};
1427  }
1428  case 8: {
1429  int64_t agg_result = agg_init_val;
1430  for (size_t i = 0; i < out_vec_sz; ++i) {
1431  agg_min_double_skip_val(
1432  &agg_result,
1433  *reinterpret_cast<const double*>(may_alias_ptr(&out_vec[i])),
1434  *reinterpret_cast<const double*>(may_alias_ptr(&agg_init_val)));
1435  }
1436  return {agg_result, 0};
1437  }
1438  default:
1439  CHECK(false);
1440  }
1441  }
1442  }
1443  case kMAX:
1444  if (ti.is_integer() || ti.is_decimal() || ti.is_time() || ti.is_boolean()) {
1445  int64_t agg_result = agg_init_val;
1446  for (size_t i = 0; i < out_vec_sz; ++i) {
1447  agg_max_skip_val(&agg_result, out_vec[i], agg_init_val);
1448  }
1449  return {agg_result, 0};
1450  } else {
1451  switch (out_byte_width) {
1452  case 4: {
1453  int32_t agg_result = static_cast<int32_t>(agg_init_val);
1454  for (size_t i = 0; i < out_vec_sz; ++i) {
1455  agg_max_float_skip_val(
1456  &agg_result,
1457  *reinterpret_cast<const float*>(may_alias_ptr(&out_vec[i])),
1458  *reinterpret_cast<const float*>(may_alias_ptr(&agg_init_val)));
1459  }
1460  const int64_t converted_bin =
1461  float_argument_input ? static_cast<int64_t>(agg_result)
1462  : float_to_double_bin(agg_result, !ti.get_notnull());
1463  return {converted_bin, 0};
1464  }
1465  case 8: {
1466  int64_t agg_result = agg_init_val;
1467  for (size_t i = 0; i < out_vec_sz; ++i) {
1468  agg_max_double_skip_val(
1469  &agg_result,
1470  *reinterpret_cast<const double*>(may_alias_ptr(&out_vec[i])),
1471  *reinterpret_cast<const double*>(may_alias_ptr(&agg_init_val)));
1472  }
1473  return {agg_result, 0};
1474  }
1475  default:
1476  CHECK(false);
1477  }
1478  }
1479  case kSINGLE_VALUE: {
1480  int64_t agg_result = agg_init_val;
1481  for (size_t i = 0; i < out_vec_sz; ++i) {
1482  if (out_vec[i] != agg_init_val) {
1483  if (agg_result == agg_init_val) {
1484  agg_result = out_vec[i];
1485  } else if (out_vec[i] != agg_result) {
1486  return {agg_result, ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES};
1487  }
1488  }
1489  }
1490  return {agg_result, 0};
1491  }
1492  case kSAMPLE: {
1493  int64_t agg_result = agg_init_val;
1494  for (size_t i = 0; i < out_vec_sz; ++i) {
1495  if (out_vec[i] != agg_init_val) {
1496  agg_result = out_vec[i];
1497  break;
1498  }
1499  }
1500  return {agg_result, 0};
1501  }
1502  default:
1503  UNREACHABLE() << "Unsupported SQLAgg: " << agg;
1504  }
1505  abort();
1506 }
int64_t float_to_double_bin(int32_t val, bool nullable=false)
bool is_fp() const
Definition: sqltypes.h:571
#define UNREACHABLE()
Definition: Logger.h:338
bool is_time() const
Definition: sqltypes.h:577
RUNTIME_EXPORT void agg_sum_float_skip_val(int32_t *agg, const float val, const float skip_val)
Definition: sqldefs.h:75
RUNTIME_EXPORT void agg_sum_double_skip_val(int64_t *agg, const double val, const double skip_val)
bool is_integer() const
Definition: sqltypes.h:565
RUNTIME_EXPORT void agg_max_double_skip_val(int64_t *agg, const double val, const double skip_val)
Definition: sqldefs.h:77
bool is_boolean() const
Definition: sqltypes.h:580
static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
Definition: Execute.h:1628
RUNTIME_EXPORT void agg_min_skip_val(int64_t *agg, const int64_t val, const int64_t skip_val)
Definition: sqldefs.h:78
RUNTIME_EXPORT void agg_min_double_skip_val(int64_t *agg, const double val, const double skip_val)
#define CHECK(condition)
Definition: Logger.h:291
RUNTIME_EXPORT void agg_max_float_skip_val(int32_t *agg, const float val, const float skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t *agg, const int64_t val, const int64_t skip_val)
RUNTIME_EXPORT void agg_min_float_skip_val(int32_t *agg, const float val, const float skip_val)
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:398
Definition: sqldefs.h:76
RUNTIME_EXPORT void agg_max_skip_val(int64_t *agg, const int64_t val, const int64_t skip_val)
bool is_decimal() const
Definition: sqltypes.h:568
Definition: sqldefs.h:74

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
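
The skip-value helpers referenced above treat a partial aggregate equal to the initial (sentinel) value as "no rows contributed". A simplified, self-contained mirror of the SUM case (illustrative names only, not the runtime functions themselves):

#include <cstdint>
#include <vector>

// Partials equal to the sentinel are ignored; the first real value seeds the
// accumulator and subsequent real values are added.
int64_t reduce_sum_skip_val(const std::vector<int64_t>& partials,
                            const int64_t skip_val) {
  int64_t acc = skip_val;
  for (const int64_t v : partials) {
    if (v == skip_val) {
      continue;  // this device produced no value for the aggregate
    }
    acc = (acc == skip_val) ? v : acc + v;
  }
  return acc;
}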

ResultSetPtr Executor::reduceSpeculativeTopN ( const RelAlgExecutionUnit ra_exe_unit,
std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &  all_fragment_results,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const QueryMemoryDescriptor query_mem_desc 
) const
private

Definition at line 1699 of file Execute.cpp.

References SpeculativeTopNMap::asRows(), CHECK, CHECK_EQ, SortInfo::limit, SortInfo::offset, SortInfo::order_entries, SpeculativeTopNMap::reduce(), run_benchmark_import::result, report::rows, RelAlgExecutionUnit::sort_info, and RelAlgExecutionUnit::target_exprs.

Referenced by collectAllDeviceResults().

1703  {
1704  if (results_per_device.size() == 1) {
1705  return std::move(results_per_device.front().first);
1706  }
1707  const auto top_n =
1708  ra_exe_unit.sort_info.limit.value_or(0) + ra_exe_unit.sort_info.offset;
1709  SpeculativeTopNMap m;
1710  for (const auto& result : results_per_device) {
1711  auto rows = result.first;
1712  CHECK(rows);
1713  if (!rows) {
1714  continue;
1715  }
1716  SpeculativeTopNMap that(
1717  *rows,
1718  ra_exe_unit.target_exprs,
1719  std::max(size_t(10000 * std::max(1, static_cast<int>(log(top_n)))), top_n));
1720  m.reduce(that);
1721  }
1722  CHECK_EQ(size_t(1), ra_exe_unit.sort_info.order_entries.size());
1723  const auto desc = ra_exe_unit.sort_info.order_entries.front().is_desc;
1724  return m.asRows(ra_exe_unit, row_set_mem_owner, query_mem_desc, this, top_n, desc);
1725 }
std::vector< Analyzer::Expr * > target_exprs
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void reduce(SpeculativeTopNMap &that)
tuple rows
Definition: report.py:114
std::optional< size_t > limit
std::list< Analyzer::OrderEntry > order_entries
std::shared_ptr< ResultSet > asRows(const RelAlgExecutionUnit &ra_exe_unit, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const QueryMemoryDescriptor &query_mem_desc, const Executor *executor, const size_t top_n, const bool desc) const
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
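
The per-device speculative maps are sized from top_n = limit + offset. A small stand-alone illustration of that capacity arithmetic (the function name is hypothetical):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Each per-device map is sized as max(10000 * max(1, (int)log(top_n)), top_n).
std::size_t speculative_map_capacity(std::size_t limit, std::size_t offset) {
  const std::size_t top_n = limit + offset;
  const int log_factor = std::max(1, static_cast<int>(std::log(top_n)));
  return std::max(std::size_t(10000) * log_factor, top_n);
}
// e.g. limit = 100, offset = 0: top_n = 100, log factor = 4, capacity = 40000.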

void Executor::registerActiveModule ( void *  module,
const int  device_id 
)
static

Definition at line 20 of file GpuInterrupt.cpp.

References CHECK_LT, to_string(), and VLOG.

20  {
21 #ifdef HAVE_CUDA
22  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
23  CHECK_LT(device_id, max_gpu_count);
24  gpu_active_modules_device_mask_ |= (1 << device_id);
25  gpu_active_modules_[device_id] = module;
26  VLOG(1) << "Registered module " << module << " on device " << std::to_string(device_id);
27 #endif
28 }
static const int max_gpu_count
Definition: Execute.h:1535
static void * gpu_active_modules_[max_gpu_count]
Definition: Execute.h:1541
static uint32_t gpu_active_modules_device_mask_
Definition: Execute.h:1540
std::string to_string(char const *&&v)
#define CHECK_LT(x, y)
Definition: Logger.h:303
#define VLOG(n)
Definition: Logger.h:388
static std::mutex gpu_active_modules_mutex_
Definition: Execute.h:1539

+ Here is the call graph for this function:
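
gpu_active_modules_device_mask_ is a per-device bitfield: bit i set means a module is active on device i. A stand-alone illustration of the set/clear operations (helper names are hypothetical):

#include <cstdint>

uint32_t register_device(uint32_t mask, int device_id) {
  return mask | (1u << device_id);   // mark device as having an active module
}

uint32_t unregister_device(uint32_t mask, int device_id) {
  return mask & ~(1u << device_id);  // clear the bit on unregister
}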

template<typename F >
static void Executor::registerExtensionFunctions ( register_extension_functions)
inlinestatic

Definition at line 470 of file Execute.h.

References execute_mutex_, executors_, executors_cache_mutex_, register_runtime_extension_functions_mutex_, and update_after_registration().

Referenced by DBHandler::register_runtime_extension_functions().

470  {
471  // Don't want native code to vanish while executing:
473  // Blocks Executor::getExecutor:
475  // Lock registration to avoid
476  // java.util.ConcurrentModificationException from calcite server
477  // when client registrations arrive too fast. Also blocks
478  // Executor::get_rt_udf_module for retrieving runtime UDF/UDTF
479  // module until this registration has rebuild it via
480  // Executor::update_after_registration:
481  std::lock_guard<std::mutex> register_lock(
482  register_runtime_extension_functions_mutex_);
483 
484  // Reset all executors:
485  for (auto& executor_item : Executor::executors_) {
486  executor_item.second->reset(/*discard_runtime_modules_only=*/true);
487  }
488  // Call registration worker, see
489  // DBHandler::register_runtime_extension_functions for details. In
490  // short, updates Executor::extension_module_sources,
491  // table_functions::TableFunctionsFactory, and registers runtime
492  // extension functions with Calcite:
493  register_extension_functions();
494 
495  // Update executors with registered LLVM modules:
496  update_after_registration(/*update_runtime_modules_only=*/true);
497  }
static heavyai::shared_mutex execute_mutex_
Definition: Execute.h:1585
std::unique_lock< T > unique_lock
static std::map< int, std::shared_ptr< Executor > > executors_
Definition: Execute.h:1581
static void update_after_registration(bool update_runtime_modules_only=false)
Definition: Execute.h:1420
static std::mutex register_runtime_extension_functions_mutex_
Definition: Execute.h:1640
static heavyai::shared_mutex executors_cache_mutex_
Definition: Execute.h:1602

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
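
A hedged usage sketch: the callable passed in performs the actual registration work (as DBHandler::register_runtime_extension_functions does), while registerExtensionFunctions handles locking, resetting the executors, and rebuilding the runtime modules. The wrapper and lambda body below are placeholders only:

#include "Execute.h"

void register_my_runtime_udfs() {
  Executor::registerExtensionFunctions([]() {
    // ... update extension module sources and register with Calcite here ...
  });
}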

void Executor::registerExtractedQueryPlanDag ( const QueryPlanDAG query_plan_dag)

Definition at line 5342 of file Execute.cpp.

References latest_query_plan_extracted_.

5342  {
5343  // this function is called under the recycler lock
5344  // e.g., QueryPlanDagExtractor::extractQueryPlanDagImpl()
5345  latest_query_plan_extracted_ = query_plan_dag;
5346 }
static QueryPlanDAG latest_query_plan_extracted_
Definition: Execute.h:1612
bool Executor::removeFromQuerySessionList ( const QuerySessionId query_session,
const std::string &  submitted_time_str,
heavyai::unique_lock< heavyai::shared_mutex > &  write_lock 
)

Definition at line 5175 of file Execute.cpp.

References executor_id_, interrupted_, queries_interrupt_flag_, and queries_session_map_.

Referenced by clearQuerySessionStatus().

5178  {
5179  if (query_session.empty()) {
5180  return false;
5181  }
5182  if (queries_session_map_.count(query_session)) {
5183  auto& storage = queries_session_map_.at(query_session);
5184  if (storage.size() > 1) {
5185  // in this case we only remove query executor info
5186  for (auto it = storage.begin(); it != storage.end(); it++) {
5187  auto target_submitted_t_str = it->second.getQuerySubmittedTime();
5188  // no time difference && have the same executor id--> found the target query
5189  if (it->second.getExecutorId() == executor_id_ &&
5190  submitted_time_str.compare(target_submitted_t_str) == 0) {
5191  storage.erase(it);
5192  return true;
5193  }
5194  }
5195  } else if (storage.size() == 1) {
5196  // here this session only has a single query executor
5197  // so we clear both executor info and its interrupt flag
5198  queries_session_map_.erase(query_session);
5199  queries_interrupt_flag_.erase(query_session);
5200  if (interrupted_.load()) {
5201  interrupted_.store(false);
5202  }
5203  return true;
5204  }
5205  }
5206  return false;
5207 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
std::atomic< bool > interrupted_
Definition: Execute.h:1543
const ExecutorId executor_id_
Definition: Execute.h:1476
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1578

+ Here is the caller graph for this function:

void Executor::reset ( bool  discard_runtime_modules_only = false)

Definition at line 323 of file Execute.cpp.

References QueryEngine::getInstance(), rt_udf_cpu_module, and rt_udf_gpu_module.

323  {
324  // TODO: keep cached results that do not depend on runtime UDF/UDTFs
325  auto qe = QueryEngine::getInstance();
326  qe->s_code_accessor->clear();
327  qe->s_stubs_accessor->clear();
328  qe->cpu_code_accessor->clear();
329  qe->gpu_code_accessor->clear();
330  qe->tf_code_accessor->clear();
331 
332  if (discard_runtime_modules_only) {
333  extension_modules_.erase(Executor::ExtModuleKinds::rt_udf_cpu_module);
334 #ifdef HAVE_CUDA
335  extension_modules_.erase(Executor::ExtModuleKinds::rt_udf_gpu_module);
336 #endif
337  cgen_state_->module_ = nullptr;
338  } else {
339  extension_modules_.clear();
340  cgen_state_.reset();
341  context_.reset(new llvm::LLVMContext());
342  cgen_state_.reset(new CgenState({}, false, this));
343  }
344 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
std::unique_ptr< llvm::LLVMContext > context_
Definition: Execute.h:1477
static std::shared_ptr< QueryEngine > getInstance()
Definition: QueryEngine.h:89
std::map< ExtModuleKinds, std::unique_ptr< llvm::Module > > extension_modules_
Definition: Execute.h:1517

+ Here is the call graph for this function:

void Executor::resetBlockSize ( )

Definition at line 4354 of file Execute.cpp.

References block_size_x_.

4354  {
4355  block_size_x_ = 0;
4356 }
unsigned block_size_x_
Definition: Execute.h:1552
void Executor::resetGridSize ( )

Definition at line 4346 of file Execute.cpp.

References grid_size_x_.

4346  {
4347  grid_size_x_ = 0;
4348 }
unsigned grid_size_x_
Definition: Execute.h:1553
void Executor::resetInterrupt ( )

Definition at line 216 of file GpuInterrupt.cpp.

References check_interrupt_init(), DW_RESET, dynamic_watchdog_init(), g_enable_dynamic_watchdog, g_enable_non_kernel_time_query_interrupt, g_enable_runtime_query_interrupt, INT_RESET, unregisterActiveModule(), and VLOG.

Referenced by clearQuerySessionStatus().

216  {
217  const auto allow_interrupt =
218  g_enable_runtime_query_interrupt || g_enable_non_kernel_time_query_interrupt;
219  if (g_enable_dynamic_watchdog) {
220  dynamic_watchdog_init(static_cast<unsigned>(DW_RESET));
221  } else if (allow_interrupt) {
222 #ifdef HAVE_CUDA
223  for (int device_id = 0; device_id < max_gpu_count; device_id++) {
224  unregisterActiveModule(device_id);
225  }
226 #endif
227  VLOG(1) << "Reset interrupt flag for CPU execution kernel on Executor "
228  << executor_id_;
229  check_interrupt_init(static_cast<unsigned>(INT_RESET));
230  }
231 
232  if (interrupted_.load()) {
233  VLOG(1) << "RESET Executor " << executor_id_
234  << " that had previously been interrupted";
235  interrupted_.store(false);
236  }
237 }
std::atomic< bool > interrupted_
Definition: Execute.h:1543
static const int max_gpu_count
Definition: Execute.h:1535
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:81
bool g_enable_non_kernel_time_query_interrupt
Definition: Execute.cpp:134
const ExecutorId executor_id_
Definition: Execute.h:1476
static void unregisterActiveModule(const int device_id)
RUNTIME_EXPORT uint64_t dynamic_watchdog_init(unsigned ms_budget)
RUNTIME_EXPORT bool check_interrupt_init(unsigned command)
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:133
#define VLOG(n)
Definition: Logger.h:388

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

ResultSetPtr Executor::resultsUnion ( SharedKernelContext shared_context,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 1538 of file Execute.cpp.

References blockSize(), CHECK_GE, CPU, DEBUG_TIMER, anonymous_namespace{Execute.cpp}::get_merged_result(), SharedKernelContext::getFragmentResults(), gridSize(), QueryMemoryDescriptor, row_set_mem_owner_, gpu_enabled::sort(), and RelAlgExecutionUnit::target_exprs.

Referenced by executeWorkUnitImpl().

1539  {
1540  auto timer = DEBUG_TIMER(__func__);
1541  auto& results_per_device = shared_context.getFragmentResults();
1542  auto const targets = shared::transform<std::vector<TargetInfo>>(
1543  ra_exe_unit.target_exprs, GetTargetInfo{});
1544  if (results_per_device.empty()) {
1545  return std::make_shared<ResultSet>(targets,
1546  ExecutorDeviceType::CPU,
1547  QueryMemoryDescriptor(),
1548  row_set_mem_owner_,
1549  blockSize(),
1550  gridSize());
1551  }
1552  using IndexedResultSet = std::pair<ResultSetPtr, std::vector<size_t>>;
1553  std::sort(results_per_device.begin(),
1554  results_per_device.end(),
1555  [](const IndexedResultSet& lhs, const IndexedResultSet& rhs) {
1556  CHECK_GE(lhs.second.size(), size_t(1));
1557  CHECK_GE(rhs.second.size(), size_t(1));
1558  return lhs.second.front() < rhs.second.front();
1559  });
1560 
1561  return get_merged_result(results_per_device, targets);
1562 }
std::vector< Analyzer::Expr * > target_exprs
ResultSetPtr get_merged_result(std::vector< std::pair< ResultSetPtr, std::vector< size_t >>> &results_per_device, std::vector< TargetInfo > const &targets)
Definition: Execute.cpp:1510
DEVICE void sort(ARGS &&...args)
Definition: gpu_enabled.h:105
#define CHECK_GE(x, y)
Definition: Logger.h:306
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: Execute.h:1533
friend class QueryMemoryDescriptor
Definition: Execute.h:1658
unsigned gridSize() const
Definition: Execute.cpp:4318
std::vector< std::pair< ResultSetPtr, std::vector< size_t > > > & getFragmentResults()
#define DEBUG_TIMER(name)
Definition: Logger.h:412
unsigned blockSize() const
Definition: Execute.cpp:4332

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Executor::resume_executor_queue ( )
static

Definition at line 5395 of file Execute.cpp.

References executor_resource_mgr_, and g_enable_executor_resource_mgr.

Referenced by anonymous_namespace{DBHandler.cpp}::pause_and_resume_executor_queue(), and DBHandler::resume_executor_queue().

5395  {
5396  if (!g_enable_executor_resource_mgr) {
5397  throw std::runtime_error(
5398  "Executor queue cannot be resumed as it requires Executor Resource Manager to be "
5399  "enabled");
5400  }
5401  executor_resource_mgr_->resume_process_queue();
5402 }
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174

+ Here is the caller graph for this function:

std::vector< int8_t > Executor::serializeLiterals ( const std::unordered_map< int, CgenState::LiteralValues > &  literals,
const int  device_id 
)
private

Definition at line 1035 of file Execute.cpp.

References CgenState::addAligned(), align(), CHECK, CHECK_EQ, CHECK_LE, g_enable_string_functions, StringDictionaryProxy::getIdOfString(), StringDictionaryProxy::getOrAddTransient(), getStringDictionaryProxy(), CgenState::literalBytes(), and row_set_mem_owner_.

Referenced by executePlanWithGroupBy(), and executePlanWithoutGroupBy().

1037  {
1038  if (literals.empty()) {
1039  return {};
1040  }
1041  const auto dev_literals_it = literals.find(device_id);
1042  CHECK(dev_literals_it != literals.end());
1043  const auto& dev_literals = dev_literals_it->second;
1044  size_t lit_buf_size{0};
1045  std::vector<std::string> real_strings;
1046  std::vector<std::vector<double>> double_array_literals;
1047  std::vector<std::vector<int8_t>> align64_int8_array_literals;
1048  std::vector<std::vector<int32_t>> int32_array_literals;
1049  std::vector<std::vector<int8_t>> align32_int8_array_literals;
1050  std::vector<std::vector<int8_t>> int8_array_literals;
1051  for (const auto& lit : dev_literals) {
1052  lit_buf_size = CgenState::addAligned(lit_buf_size, CgenState::literalBytes(lit));
1053  if (lit.which() == 7) {
1054  const auto p = boost::get<std::string>(&lit);
1055  CHECK(p);
1056  real_strings.push_back(*p);
1057  } else if (lit.which() == 8) {
1058  const auto p = boost::get<std::vector<double>>(&lit);
1059  CHECK(p);
1060  double_array_literals.push_back(*p);
1061  } else if (lit.which() == 9) {
1062  const auto p = boost::get<std::vector<int32_t>>(&lit);
1063  CHECK(p);
1064  int32_array_literals.push_back(*p);
1065  } else if (lit.which() == 10) {
1066  const auto p = boost::get<std::vector<int8_t>>(&lit);
1067  CHECK(p);
1068  int8_array_literals.push_back(*p);
1069  } else if (lit.which() == 11) {
1070  const auto p = boost::get<std::pair<std::vector<int8_t>, int>>(&lit);
1071  CHECK(p);
1072  if (p->second == 64) {
1073  align64_int8_array_literals.push_back(p->first);
1074  } else if (p->second == 32) {
1075  align32_int8_array_literals.push_back(p->first);
1076  } else {
1077  CHECK(false);
1078  }
1079  }
1080  }
1081  if (lit_buf_size > static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
1082  throw TooManyLiterals();
1083  }
1084  int16_t crt_real_str_off = lit_buf_size;
1085  for (const auto& real_str : real_strings) {
1086  CHECK_LE(real_str.size(), static_cast<size_t>(std::numeric_limits<int16_t>::max()));
1087  lit_buf_size += real_str.size();
1088  }
1089  if (double_array_literals.size() > 0) {
1090  lit_buf_size = align(lit_buf_size, sizeof(double));
1091  }
1092  int16_t crt_double_arr_lit_off = lit_buf_size;
1093  for (const auto& double_array_literal : double_array_literals) {
1094  CHECK_LE(double_array_literal.size(),
1095  static_cast<size_t>(std::numeric_limits<int16_t>::max()));
1096  lit_buf_size += double_array_literal.size() * sizeof(double);
1097  }
1098  if (align64_int8_array_literals.size() > 0) {
1099  lit_buf_size = align(lit_buf_size, sizeof(uint64_t));
1100  }
1101  int16_t crt_align64_int8_arr_lit_off = lit_buf_size;
1102  for (const auto& align64_int8_array_literal : align64_int8_array_literals) {
1103  CHECK_LE(align64_int8_array_literals.size(),
1104  static_cast<size_t>(std::numeric_limits<int16_t>::max()));
1105  lit_buf_size += align64_int8_array_literal.size();
1106  }
1107  if (int32_array_literals.size() > 0) {
1108  lit_buf_size = align(lit_buf_size, sizeof(int32_t));
1109  }
1110  int16_t crt_int32_arr_lit_off = lit_buf_size;
1111  for (const auto& int32_array_literal : int32_array_literals) {
1112  CHECK_LE(int32_array_literal.size(),
1113  static_cast<size_t>(std::numeric_limits<int16_t>::max()));
1114  lit_buf_size += int32_array_literal.size() * sizeof(int32_t);
1115  }
1116  if (align32_int8_array_literals.size() > 0) {
1117  lit_buf_size = align(lit_buf_size, sizeof(int32_t));
1118  }
1119  int16_t crt_align32_int8_arr_lit_off = lit_buf_size;
1120  for (const auto& align32_int8_array_literal : align32_int8_array_literals) {
1121  CHECK_LE(align32_int8_array_literals.size(),
1122  static_cast<size_t>(std::numeric_limits<int16_t>::max()));
1123  lit_buf_size += align32_int8_array_literal.size();
1124  }
1125  int16_t crt_int8_arr_lit_off = lit_buf_size;
1126  for (const auto& int8_array_literal : int8_array_literals) {
1127  CHECK_LE(int8_array_literal.size(),
1128  static_cast<size_t>(std::numeric_limits<int16_t>::max()));
1129  lit_buf_size += int8_array_literal.size();
1130  }
1131  unsigned crt_real_str_idx = 0;
1132  unsigned crt_double_arr_lit_idx = 0;
1133  unsigned crt_align64_int8_arr_lit_idx = 0;
1134  unsigned crt_int32_arr_lit_idx = 0;
1135  unsigned crt_align32_int8_arr_lit_idx = 0;
1136  unsigned crt_int8_arr_lit_idx = 0;
1137  std::vector<int8_t> serialized(lit_buf_size);
1138  size_t off{0};
1139  for (const auto& lit : dev_literals) {
1140  const auto lit_bytes = CgenState::literalBytes(lit);
1141  off = CgenState::addAligned(off, lit_bytes);
1142  switch (lit.which()) {
1143  case 0: {
1144  const auto p = boost::get<int8_t>(&lit);
1145  CHECK(p);
1146  serialized[off - lit_bytes] = *p;
1147  break;
1148  }
1149  case 1: {
1150  const auto p = boost::get<int16_t>(&lit);
1151  CHECK(p);
1152  memcpy(&serialized[off - lit_bytes], p, lit_bytes);
1153  break;
1154  }
1155  case 2: {
1156  const auto p = boost::get<int32_t>(&lit);
1157  CHECK(p);
1158  memcpy(&serialized[off - lit_bytes], p, lit_bytes);
1159  break;
1160  }
1161  case 3: {
1162  const auto p = boost::get<int64_t>(&lit);
1163  CHECK(p);
1164  memcpy(&serialized[off - lit_bytes], p, lit_bytes);
1165  break;
1166  }
1167  case 4: {
1168  const auto p = boost::get<float>(&lit);
1169  CHECK(p);
1170  memcpy(&serialized[off - lit_bytes], p, lit_bytes);
1171  break;
1172  }
1173  case 5: {
1174  const auto p = boost::get<double>(&lit);
1175  CHECK(p);
1176  memcpy(&serialized[off - lit_bytes], p, lit_bytes);
1177  break;
1178  }
1179  case 6: {
1180  const auto p = boost::get<std::pair<std::string, shared::StringDictKey>>(&lit);
1181  CHECK(p);
1182  const auto str_id =
1183  g_enable_string_functions
1184  ? getStringDictionaryProxy(p->second, row_set_mem_owner_, true)
1185  ->getOrAddTransient(p->first)
1186  : getStringDictionaryProxy(p->second, row_set_mem_owner_, true)
1187  ->getIdOfString(p->first);
1188  memcpy(&serialized[off - lit_bytes], &str_id, lit_bytes);
1189  break;
1190  }
1191  case 7: {
1192  const auto p = boost::get<std::string>(&lit);
1193  CHECK(p);
1194  int32_t off_and_len = crt_real_str_off << 16;
1195  const auto& crt_real_str = real_strings[crt_real_str_idx];
1196  off_and_len |= static_cast<int16_t>(crt_real_str.size());
1197  memcpy(&serialized[off - lit_bytes], &off_and_len, lit_bytes);
1198  memcpy(&serialized[crt_real_str_off], crt_real_str.data(), crt_real_str.size());
1199  ++crt_real_str_idx;
1200  crt_real_str_off += crt_real_str.size();
1201  break;
1202  }
1203  case 8: {
1204  const auto p = boost::get<std::vector<double>>(&lit);
1205  CHECK(p);
1206  int32_t off_and_len = crt_double_arr_lit_off << 16;
1207  const auto& crt_double_arr_lit = double_array_literals[crt_double_arr_lit_idx];
1208  int32_t len = crt_double_arr_lit.size();
1209  CHECK_EQ((len >> 16), 0);
1210  off_and_len |= static_cast<int16_t>(len);
1211  int32_t double_array_bytesize = len * sizeof(double);
1212  memcpy(&serialized[off - lit_bytes], &off_and_len, lit_bytes);
1213  memcpy(&serialized[crt_double_arr_lit_off],
1214  crt_double_arr_lit.data(),
1215  double_array_bytesize);
1216  ++crt_double_arr_lit_idx;
1217  crt_double_arr_lit_off += double_array_bytesize;
1218  break;
1219  }
1220  case 9: {
1221  const auto p = boost::get<std::vector<int32_t>>(&lit);
1222  CHECK(p);
1223  int32_t off_and_len = crt_int32_arr_lit_off << 16;
1224  const auto& crt_int32_arr_lit = int32_array_literals[crt_int32_arr_lit_idx];
1225  int32_t len = crt_int32_arr_lit.size();
1226  CHECK_EQ((len >> 16), 0);
1227  off_and_len |= static_cast<int16_t>(len);
1228  int32_t int32_array_bytesize = len * sizeof(int32_t);
1229  memcpy(&serialized[off - lit_bytes], &off_and_len, lit_bytes);
1230  memcpy(&serialized[crt_int32_arr_lit_off],
1231  crt_int32_arr_lit.data(),
1232  int32_array_bytesize);
1233  ++crt_int32_arr_lit_idx;
1234  crt_int32_arr_lit_off += int32_array_bytesize;
1235  break;
1236  }
1237  case 10: {
1238  const auto p = boost::get<std::vector<int8_t>>(&lit);
1239  CHECK(p);
1240  int32_t off_and_len = crt_int8_arr_lit_off << 16;
1241  const auto& crt_int8_arr_lit = int8_array_literals[crt_int8_arr_lit_idx];
1242  int32_t len = crt_int8_arr_lit.size();
1243  CHECK_EQ((len >> 16), 0);
1244  off_and_len |= static_cast<int16_t>(len);
1245  int32_t int8_array_bytesize = len;
1246  memcpy(&serialized[off - lit_bytes], &off_and_len, lit_bytes);
1247  memcpy(&serialized[crt_int8_arr_lit_off],
1248  crt_int8_arr_lit.data(),
1249  int8_array_bytesize);
1250  ++crt_int8_arr_lit_idx;
1251  crt_int8_arr_lit_off += int8_array_bytesize;
1252  break;
1253  }
1254  case 11: {
1255  const auto p = boost::get<std::pair<std::vector<int8_t>, int>>(&lit);
1256  CHECK(p);
1257  if (p->second == 64) {
1258  int32_t off_and_len = crt_align64_int8_arr_lit_off << 16;
1259  const auto& crt_align64_int8_arr_lit =
1260  align64_int8_array_literals[crt_align64_int8_arr_lit_idx];
1261  int32_t len = crt_align64_int8_arr_lit.size();
1262  CHECK_EQ((len >> 16), 0);
1263  off_and_len |= static_cast<int16_t>(len);
1264  int32_t align64_int8_array_bytesize = len;
1265  memcpy(&serialized[off - lit_bytes], &off_and_len, lit_bytes);
1266  memcpy(&serialized[crt_align64_int8_arr_lit_off],
1267  crt_align64_int8_arr_lit.data(),
1268  align64_int8_array_bytesize);
1269  ++crt_align64_int8_arr_lit_idx;
1270  crt_align64_int8_arr_lit_off += align64_int8_array_bytesize;
1271  } else if (p->second == 32) {
1272  int32_t off_and_len = crt_align32_int8_arr_lit_off << 16;
1273  const auto& crt_align32_int8_arr_lit =
1274  align32_int8_array_literals[crt_align32_int8_arr_lit_idx];
1275  int32_t len = crt_align32_int8_arr_lit.size();
1276  CHECK_EQ((len >> 16), 0);
1277  off_and_len |= static_cast<int16_t>(len);
1278  int32_t align32_int8_array_bytesize = len;
1279  memcpy(&serialized[off - lit_bytes], &off_and_len, lit_bytes);
1280  memcpy(&serialized[crt_align32_int8_arr_lit_off],
1281  crt_align32_int8_arr_lit.data(),
1282  align32_int8_array_bytesize);
1283  ++crt_align32_int8_arr_lit_idx;
1284  crt_align32_int8_arr_lit_off += align32_int8_array_bytesize;
1285  } else {
1286  CHECK(false);
1287  }
1288  break;
1289  }
1290  default:
1291  CHECK(false);
1292  }
1293  }
1294  return serialized;
1295 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
StringDictionaryProxy * getStringDictionaryProxy(const shared::StringDictKey &dict_key, const bool with_generation) const
Definition: Execute.h:578
static size_t literalBytes(const CgenState::LiteralValue &lit)
Definition: CgenState.h:418
bool g_enable_string_functions
static size_t addAligned(const size_t off_in, const size_t alignment)
Definition: CgenState.h:449
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: Execute.h:1533
int32_t getOrAddTransient(const std::string &)
#define CHECK_LE(x, y)
Definition: Logger.h:304
#define CHECK(condition)
Definition: Logger.h:291
static size_t align(const size_t off_in, const size_t alignment)
Definition: Execute.h:1468
int32_t getIdOfString(const std::string &str) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
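
Variable-length literals above are referenced through a packed 32-bit off_and_len value: the buffer offset in the upper 16 bits and the length in the lower 16 bits. A stand-alone illustration of the packing and unpacking (names are hypothetical; the real code ORs an int16_t length directly, while this sketch masks to avoid sign extension):

#include <cstdint>

int32_t pack_off_and_len(int16_t offset, int16_t len) {
  return (static_cast<int32_t>(offset) << 16) | static_cast<uint16_t>(len);
}

int16_t unpack_offset(int32_t off_and_len) {
  return static_cast<int16_t>(off_and_len >> 16);
}

int16_t unpack_len(int32_t off_and_len) {
  return static_cast<int16_t>(off_and_len & 0xFFFF);
}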

void Executor::set_concurrent_resource_grant_policy ( const ExecutorResourceMgr_Namespace::ConcurrentResourceGrantPolicy concurrent_resource_grant_policy)
static

Definition at line 5443 of file Execute.cpp.

References executor_resource_mgr_, and g_enable_executor_resource_mgr.

5445  {
5446  if (!g_enable_executor_resource_mgr) {
5447  throw std::runtime_error(
5448  "ExecutorResourceMgr must be enabled to set executor concurrent resource grant "
5449  "policy.");
5450  }
5451  executor_resource_mgr_->set_concurrent_resource_grant_policy(
5452  concurrent_resource_grant_policy);
5453 }
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174
void Executor::set_executor_resource_pool_resource ( const ExecutorResourceMgr_Namespace::ResourceType  resource_type,
const size_t  resource_quantity 
)
static

Definition at line 5422 of file Execute.cpp.

References executor_resource_mgr_, and g_enable_executor_resource_mgr.

5424  {
5425  if (!g_enable_executor_resource_mgr) {
5426  throw std::runtime_error(
5427  "ExecutorResourceMgr must be enabled to set executor resource pool resource.");
5428  }
5429  executor_resource_mgr_->set_resource(resource_type, resource_quantity);
5430 }
static std::shared_ptr< ExecutorResourceMgr_Namespace::ExecutorResourceMgr > executor_resource_mgr_
Definition: Execute.h:1645
bool g_enable_executor_resource_mgr
Definition: Execute.cpp:174
void Executor::setBlockSize ( unsigned  block_size)

Definition at line 4350 of file Execute.cpp.

References block_size_x_.

4350  {
4351  block_size_x_ = block_size;
4352 }
unsigned block_size_x_
Definition: Execute.h:1552
void Executor::setColRangeCache ( const AggregatedColRange aggregated_col_range)
inline

Definition at line 1329 of file Execute.h.

References agg_col_range_cache_.

1329  {
1330  agg_col_range_cache_ = aggregated_col_range;
1331  }
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1572
void Executor::setGridSize ( unsigned  grid_size)

Definition at line 4342 of file Execute.cpp.

References grid_size_x_.

4342  {
4343  grid_size_x_ = grid_size;
4344 }
unsigned grid_size_x_
Definition: Execute.h:1553
void Executor::setQuerySessionAsInterrupted ( const QuerySessionId query_session,
heavyai::unique_lock< heavyai::shared_mutex > &  write_lock 
)

Definition at line 5209 of file Execute.cpp.

References queries_interrupt_flag_.

5211  {
5212  if (query_session.empty()) {
5213  return;
5214  }
5215  if (queries_interrupt_flag_.find(query_session) != queries_interrupt_flag_.end()) {
5216  queries_interrupt_flag_[query_session] = true;
5217  }
5218 }
static InterruptFlagMap queries_interrupt_flag_
Definition: Execute.h:1578
void Executor::setupCaching ( const std::unordered_set< PhysicalInput > &  phys_inputs,
const std::unordered_set< shared::TableKey > &  phys_table_keys 
)

Definition at line 4926 of file Execute.cpp.

References agg_col_range_cache_, computeColRangesCache(), computeStringDictionaryGenerations(), computeTableGenerations(), cpu_threads(), executor_id_, getArenaBlockSize(), row_set_mem_owner_, and table_generations_.

4927  {
4928  row_set_mem_owner_ = std::make_shared<RowSetMemoryOwner>(
4929  Executor::getArenaBlockSize(), executor_id_, cpu_threads());
4930  row_set_mem_owner_->setDictionaryGenerations(
4931  computeStringDictionaryGenerations(phys_inputs));
4932  agg_col_range_cache_ = computeColRangesCache(phys_inputs);
4933  table_generations_ = computeTableGenerations(phys_table_ids);
4934 }
AggregatedColRange computeColRangesCache(const std::unordered_set< PhysicalInput > &phys_inputs)
Definition: Execute.cpp:4860
AggregatedColRange agg_col_range_cache_
Definition: Execute.h:1572
const ExecutorId executor_id_
Definition: Execute.h:1476
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
Definition: Execute.h:1533
TableGenerations computeTableGenerations(const std::unordered_set< shared::TableKey > &phys_table_keys)
Definition: Execute.cpp:4914
StringDictionaryGenerations computeStringDictionaryGenerations(const std::unordered_set< PhysicalInput > &phys_inputs)
Definition: Execute.cpp:4888
TableGenerations table_generations_
Definition: Execute.h:1573
int cpu_threads()
Definition: thread_count.h:25
static size_t getArenaBlockSize()
Definition: Execute.cpp:558

+ Here is the call graph for this function:

std::pair< bool, int64_t > Executor::skipFragment ( const InputDescriptor &  table_desc,
const Fragmenter_Namespace::FragmentInfo &  frag_info,
const std::list< std::shared_ptr< Analyzer::Expr >> &  simple_quals,
const std::vector< uint64_t > &  frag_offsets,
const size_t  frag_idx 
)
private

Definition at line 4624 of file Execute.cpp.

References canSkipFragmentForFpQual(), CHECK, CodeGenerator::codegenIntConst(), DateTruncateHighPrecisionToDate(), extract_max_stat_int_type(), extract_min_stat_int_type(), get_column_descriptor(), anonymous_namespace{Execute.cpp}::get_hpt_overflow_underflow_safe_scaled_values(), Analyzer::BinOper::get_left_operand(), Fragmenter_Namespace::FragmentInfo::getChunkMetadataMap(), getTableGeneration(), InputDescriptor::getTableKey(), INVALID, isFragmentFullyDeleted(), kCAST, kEQ, kGE, kGT, kLE, kLT, kTIME, NOT_SKIPPABLE, Fragmenter_Namespace::FragmentInfo::physicalTableId, SKIPPABLE, to_string(), UNREACHABLE, and VLOG.

Referenced by skipFragmentInnerJoins().

4629  {
4630  // First check to see if all of fragment is deleted, in which case we know we can skip
4631  if (isFragmentFullyDeleted(table_desc, fragment)) {
4632  VLOG(2) << "Skipping deleted fragment with table id: " << fragment.physicalTableId
4633  << ", fragment id: " << frag_idx;
4634  return {true, -1};
4635  }
4636 
4637  for (const auto& simple_qual : simple_quals) {
4638  const auto comp_expr =
4639  std::dynamic_pointer_cast<const Analyzer::BinOper>(simple_qual);
4640  if (!comp_expr) {
4641  // is this possible?
4642  return {false, -1};
4643  }
4644  const auto lhs = comp_expr->get_left_operand();
4645  auto lhs_col = dynamic_cast<const Analyzer::ColumnVar*>(lhs);
4646  if (!lhs_col || !lhs_col->getColumnKey().table_id || lhs_col->get_rte_idx()) {
4647  // See if lhs is a simple cast that was allowed through normalize_simple_predicate
4648  auto lhs_uexpr = dynamic_cast<const Analyzer::UOper*>(lhs);
4649  if (lhs_uexpr) {
4650  CHECK(lhs_uexpr->get_optype() ==
4651  kCAST); // We should have only been passed a cast expression
4652  lhs_col = dynamic_cast<const Analyzer::ColumnVar*>(lhs_uexpr->get_operand());
4653  if (!lhs_col || !lhs_col->getColumnKey().table_id || lhs_col->get_rte_idx()) {
4654  continue;
4655  }
4656  } else {
4657  continue;
4658  }
4659  }
4660  const auto rhs = comp_expr->get_right_operand();
4661  const auto rhs_const = dynamic_cast<const Analyzer::Constant*>(rhs);
4662  if (!rhs_const) {
4663  // is this possible?
4664  return {false, -1};
4665  }
4666  if (!lhs->get_type_info().is_integer() && !lhs->get_type_info().is_time() &&
4667  !lhs->get_type_info().is_fp()) {
4668  continue;
4669  }
4670  if (lhs->get_type_info().is_fp()) {
4671  const auto fragment_skip_status =
4672  canSkipFragmentForFpQual(comp_expr.get(), lhs_col, fragment, rhs_const);
4673  switch (fragment_skip_status) {
4674  case FragmentSkipStatus::SKIPPABLE:
4675  return {true, -1};
4676  case FragmentSkipStatus::INVALID:
4677  return {false, -1};
4678  case FragmentSkipStatus::NOT_SKIPPABLE:
4679  continue;
4680  default:
4681  UNREACHABLE();
4682  }
4683  }
4684 
4685  // Everything below is logic for integer and integer-backed timestamps
4686  // TODO: Factor out into separate function per canSkipFragmentForFpQual above
4687 
4688  if (lhs_col->get_type_info().is_timestamp() &&
4689  rhs_const->get_type_info().is_any<kTIME>()) {
4690  // when casting from a timestamp to time
4691  // is not possible to get a valid range
4692  // so we can't skip any fragment
4693  continue;
4694  }
4695 
4696  const int col_id = lhs_col->getColumnKey().column_id;
4697  auto chunk_meta_it = fragment.getChunkMetadataMap().find(col_id);
4698  int64_t chunk_min{0};
4699  int64_t chunk_max{0};
4700  bool is_rowid{false};
4701  size_t start_rowid{0};
4702  const auto& table_key = table_desc.getTableKey();
4703  if (chunk_meta_it == fragment.getChunkMetadataMap().end()) {
4704  auto cd = get_column_descriptor({table_key, col_id});
4705  if (cd->isVirtualCol) {
4706  CHECK(cd->columnName == "rowid");
4707  const auto& table_generation = getTableGeneration(table_key);
4708  start_rowid = table_generation.start_rowid;
4709  chunk_min = frag_offsets[frag_idx] + start_rowid;
4710  chunk_max = frag_offsets[frag_idx + 1] - 1 + start_rowid;
4711  is_rowid = true;
4712  }
4713  } else {
4714  const auto& chunk_type = lhs_col->get_type_info();
4715  chunk_min =
4716  extract_min_stat_int_type(chunk_meta_it->second->chunkStats, chunk_type);
4717  chunk_max =
4718  extract_max_stat_int_type(chunk_meta_it->second->chunkStats, chunk_type);
4719  }
4720  if (chunk_min > chunk_max) {
4721  // invalid metadata range, do not skip fragment
4722  return {false, -1};
4723  }
4724  if (lhs->get_type_info().is_timestamp() &&
4725  (lhs_col->get_type_info().get_dimension() !=
4726  rhs_const->get_type_info().get_dimension()) &&
4727  (lhs_col->get_type_info().is_high_precision_timestamp() ||
4728  rhs_const->get_type_info().is_high_precision_timestamp())) {
4729  // If original timestamp lhs col has different precision,
4730  // column metadata holds value in original precision
4731  // therefore adjust rhs value to match lhs precision
4732 
4733  // Note(Wamsi): We adjust rhs const value instead of lhs value to not
4734  // artificially limit the lhs column range. RHS overflow/underflow is already
4735  // been validated in `TimeGM::get_overflow_underflow_safe_epoch`.
4736  bool is_valid;
4737  std::tie(is_valid, chunk_min, chunk_max) =
4738  get_hpt_overflow_underflow_safe_scaled_values(
4739  chunk_min, chunk_max, lhs_col->get_type_info(), rhs_const->get_type_info());
4740  if (!is_valid) {
4741  VLOG(4) << "Overflow/Underflow detecting in fragments skipping logic.\nChunk min "
4742  "value: "
4743  << std::to_string(chunk_min)
4744  << "\nChunk max value: " << std::to_string(chunk_max)
4745  << "\nLHS col precision is: "
4746  << std::to_string(lhs_col->get_type_info().get_dimension())
4747  << "\nRHS precision is: "
4748  << std::to_string(rhs_const->get_type_info().get_dimension()) << ".";
4749  return {false, -1};
4750  }
4751  }
4752  if (lhs_col->get_type_info().is_timestamp() && rhs_const->get_type_info().is_date()) {
4753  // It is obvious that a cast from timestamp to date is happening here,
4754  // so we have to correct the chunk min and max values to lower the precision as of
4755  // the date
4756  chunk_min = DateTruncateHighPrecisionToDate(
4757  chunk_min, pow(10, lhs_col->get_type_info().get_dimension()));
4758  chunk_max = DateTruncateHighPrecisionToDate(
4759  chunk_max, pow(10, lhs_col->get_type_info().get_dimension()));
4760  }
4761  llvm::LLVMContext local_context;
4762  CgenState local_cgen_state(local_context);
4763  CodeGenerator code_generator(&local_cgen_state, nullptr);
4764 
4765  const auto rhs_val =
4766  CodeGenerator::codegenIntConst(rhs_const, &local_cgen_state)->getSExtValue();
4767 
4768  switch (comp_expr->get_optype()) {
4769  case kGE:
4770  if (chunk_max < rhs_val) {
4771  return {true, -1};
4772  }
4773  break;
4774  case kGT:
4775  if (chunk_max <= rhs_val) {
4776  return {true, -1};
4777  }
4778  break;
4779  case kLE:
4780  if (chunk_min > rhs_val) {
4781  return {true, -1};
4782  }
4783  break;
4784  case kLT:
4785  if (chunk_min >= rhs_val) {
4786  return {true, -1};
4787  }
4788  break;
4789  case kEQ:
4790  if (chunk_min > rhs_val || chunk_max < rhs_val) {
4791  return {true, -1};
4792  } else if (is_rowid) {
4793  return {false, rhs_val - start_rowid};
4794  }
4795  break;
4796  default:
4797  break;
4798  }
4799  }
4800  return {false, -1};
4801 }
Definition: sqltypes.h:76
std::tuple< bool, int64_t, int64_t > get_hpt_overflow_underflow_safe_scaled_values(const int64_t chunk_min, const int64_t chunk_max, const SQLTypeInfo &lhs_type, const SQLTypeInfo &rhs_type)
Definition: Execute.cpp:4492
Definition: sqldefs.h:34
Definition: sqldefs.h:35
#define UNREACHABLE()
Definition: Logger.h:338
Definition: sqldefs.h:48
Definition: sqldefs.h:29
FragmentSkipStatus canSkipFragmentForFpQual(const Analyzer::BinOper *comp_expr, const Analyzer::ColumnVar *lhs_col, const Fragmenter_Namespace::FragmentInfo &fragment, const Analyzer::Constant *rhs_const) const
Definition: Execute.cpp:4564
int64_t extract_max_stat_int_type(const ChunkStats &stats, const SQLTypeInfo &ti)
std::string to_string(char const *&&v)
int64_t extract_min_stat_int_type(const ChunkStats &stats, const SQLTypeInfo &ti)
const ColumnDescriptor * get_column_descriptor(const shared::ColumnKey &column_key)
Definition: Execute.h:213
const shared::TableKey & getTableKey() const
Definition: sqldefs.h:33
static llvm::ConstantInt * codegenIntConst(const Analyzer::Constant *constant, CgenState *cgen_state)
Definition: ConstantIR.cpp:89
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int64_t DateTruncateHighPrecisionToDate(const int64_t timeval, const int64_t scale)
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqldefs.h:32
const Expr * get_left_operand() const
Definition: Analyzer.h:455
const TableGeneration & getTableGeneration(const shared::TableKey &table_key) const
Definition: Execute.cpp:716
#define VLOG(n)
Definition: Logger.h:388
bool isFragmentFullyDeleted(const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &fragment)
Definition: Execute.cpp:4527
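
The core of the routine above compares the constant on the right-hand side of a simple qualifier against the chunk-level min/max metadata and skips the fragment when the predicate cannot possibly match any row. A self-contained sketch of that comparison for integer columns (simplified; no rowid, cast, or timestamp handling, and the operator enum is illustrative):

#include <cstdint>
#include <iostream>

enum class CompOp { kGE, kGT, kLE, kLT, kEQ };

// Returns true when no value in [chunk_min, chunk_max] can satisfy "col OP rhs",
// i.e. the whole fragment can be skipped without scanning it.
bool can_skip_fragment(int64_t chunk_min, int64_t chunk_max, CompOp op, int64_t rhs) {
  if (chunk_min > chunk_max) {
    return false;  // invalid metadata range: never skip
  }
  switch (op) {
    case CompOp::kGE: return chunk_max < rhs;
    case CompOp::kGT: return chunk_max <= rhs;
    case CompOp::kLE: return chunk_min > rhs;
    case CompOp::kLT: return chunk_min >= rhs;
    case CompOp::kEQ: return chunk_min > rhs || chunk_max < rhs;
  }
  return false;
}

int main() {
  // Fragment whose column values all lie in [10, 20].
  std::cout << can_skip_fragment(10, 20, CompOp::kGT, 25) << "\n";  // 1: skip
  std::cout << can_skip_fragment(10, 20, CompOp::kEQ, 15) << "\n";  // 0: must scan
  return 0;
}
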

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::pair< bool, int64_t > Executor::skipFragmentInnerJoins ( const InputDescriptor &  table_desc,
const RelAlgExecutionUnit &  ra_exe_unit,
const Fragmenter_Namespace::FragmentInfo &  fragment,
const std::vector< uint64_t > &  frag_offsets,
const size_t  frag_idx 
)
private

Definition at line 4827 of file Execute.cpp.

References INNER, RelAlgExecutionUnit::join_quals, qual_to_conjunctive_form(), and skipFragment().

4832  {
4833  std::pair<bool, int64_t> skip_frag{false, -1};
4834  for (auto& inner_join : ra_exe_unit.join_quals) {
4835  if (inner_join.type != JoinType::INNER) {
4836  continue;
4837  }
4838 
4839  // extracting all the conjunctive simple_quals from the quals stored for the inner
4840  // join
4841  std::list<std::shared_ptr<Analyzer::Expr>> inner_join_simple_quals;
4842  for (auto& qual : inner_join.quals) {
4843  auto temp_qual = qual_to_conjunctive_form(qual);
4844  inner_join_simple_quals.insert(inner_join_simple_quals.begin(),
4845  temp_qual.simple_quals.begin(),
4846  temp_qual.simple_quals.end());
4847  }
4848  auto temp_skip_frag = skipFragment(
4849  table_desc, fragment, inner_join_simple_quals, frag_offsets, frag_idx);
4850  if (temp_skip_frag.second != -1) {
4851  skip_frag.second = temp_skip_frag.second;
4852  return skip_frag;
4853  } else {
4854  skip_frag.first = skip_frag.first || temp_skip_frag.first;
4855  }
4856  }
4857  return skip_frag;
4858 }
QualsConjunctiveForm qual_to_conjunctive_form(const std::shared_ptr< Analyzer::Expr > qual_expr)
const JoinQualsPerNestingLevel join_quals
std::pair< bool, int64_t > skipFragment(const InputDescriptor &table_desc, const Fragmenter_Namespace::FragmentInfo &frag_info, const std::list< std::shared_ptr< Analyzer::Expr >> &simple_quals, const std::vector< uint64_t > &frag_offsets, const size_t frag_idx)
Definition: Execute.cpp:4624
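
For inner joins, the method gathers the conjunctive simple qualifiers of each INNER level and reuses skipFragment(): a non-negative rowid hint short-circuits the loop, otherwise the per-level skip flags are OR-ed together. A small sketch of that combining logic over precomputed per-level results (types are illustrative):

#include <utility>
#include <vector>

using SkipResult = std::pair<bool, long long>;  // {can_skip, rowid hint or -1}

// Combine per-join-level skip decisions the same way skipFragmentInnerJoins does:
// a rowid hint wins immediately, otherwise skip flags accumulate with OR.
SkipResult combine_skip_results(const std::vector<SkipResult>& per_level) {
  SkipResult out{false, -1};
  for (const auto& level : per_level) {
    if (level.second != -1) {
      out.second = level.second;
      return out;
    }
    out.first = out.first || level.first;
  }
  return out;
}

int main() {
  const auto r = combine_skip_results({{false, -1}, {true, -1}});
  return r.first ? 0 : 1;  // first == true: the fragment can be skipped
}
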

+ Here is the call graph for this function:

bool Executor::skipFragmentPair ( const Fragmenter_Namespace::FragmentInfo &  outer_fragment_info,
const Fragmenter_Namespace::FragmentInfo &  inner_fragment_info,
const int  inner_table_id,
const std::unordered_map< shared::TableKey, const Analyzer::BinOper * > &  inner_table_id_to_join_condition,
const RelAlgExecutionUnit &  ra_exe_unit,
const ExecutorDeviceType  device_type 
)
private

Definition at line 3246 of file Execute.cpp.

References CHECK, CHECK_EQ, get_shard_count(), BaselineJoinHashTable::getShardCountForCondition(), getTemporaryTables(), GPU, RelAlgExecutionUnit::input_descs, RelAlgExecutionUnit::join_quals, HashJoin::normalizeColumnPairs(), plan_state_, and Fragmenter_Namespace::FragmentInfo::shard.

Referenced by getTableFragmentIndices().

3253  {
3254  if (device_type != ExecutorDeviceType::GPU) {
3255  return false;
3256  }
3257  CHECK(table_idx >= 0 &&
3258  static_cast<size_t>(table_idx) < ra_exe_unit.input_descs.size());
3259  const auto& inner_table_key = ra_exe_unit.input_descs[table_idx].getTableKey();
3260  // Both tables need to be sharded the same way.
3261  if (outer_fragment_info.shard == -1 || inner_fragment_info.shard == -1 ||
3262  outer_fragment_info.shard == inner_fragment_info.shard) {
3263  return false;
3264  }
3265  const Analyzer::BinOper* join_condition{nullptr};
3266  if (ra_exe_unit.join_quals.empty()) {
3267  CHECK(!inner_table_id_to_join_condition.empty());
3268  auto condition_it = inner_table_id_to_join_condition.find(inner_table_key);
3269  CHECK(condition_it != inner_table_id_to_join_condition.end());
3270  join_condition = condition_it->second;
3271  CHECK(join_condition);
3272  } else {
3273  CHECK_EQ(plan_state_->join_info_.equi_join_tautologies_.size(),
3274  plan_state_->join_info_.join_hash_tables_.size());
3275  for (size_t i = 0; i < plan_state_->join_info_.join_hash_tables_.size(); ++i) {
3276  if (plan_state_->join_info_.join_hash_tables_[i]->getInnerTableRteIdx() ==
3277  table_idx) {
3278  CHECK(!join_condition);
3279  join_condition = plan_state_->join_info_.equi_join_tautologies_[i].get();
3280  }
3281  }
3282  }
3283  if (!join_condition) {
3284  return false;
3285  }
3286  // TODO(adb): support fragment skipping based on the bounding box intersect operator
3287  if (join_condition->is_bbox_intersect_oper()) {
3288  return false;
3289  }
3290  size_t shard_count{0};
3291  if (dynamic_cast<const Analyzer::ExpressionTuple*>(
3292  join_condition->get_left_operand())) {
3293  auto inner_outer_pairs =
3294  HashJoin::normalizeColumnPairs(join_condition, getTemporaryTables()).first;
3295  shard_count = BaselineJoinHashTable::getShardCountForCondition(
3296  join_condition, this, inner_outer_pairs);
3297  } else {
3298  shard_count = get_shard_count(join_condition, this);
3299  }
3300  if (shard_count && !ra_exe_unit.join_quals.empty()) {
3301  plan_state_->join_info_.sharded_range_table_indices_.emplace(table_idx);
3302  }
3303  return shard_count;
3304 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::vector< InputDescriptor > input_descs
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< PlanState > plan_state_
Definition: Execute.h:1532
const TemporaryTables * getTemporaryTables()
Definition: Execute.h:573
#define CHECK(condition)
Definition: Logger.h:291
static std::pair< std::vector< InnerOuter >, std::vector< InnerOuterStringOpInfos > > normalizeColumnPairs(const Analyzer::BinOper *condition, const TemporaryTables *temporary_tables)
Definition: HashJoin.cpp:1015
static size_t getShardCountForCondition(const Analyzer::BinOper *condition, const Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs)
size_t get_shard_count(const Analyzer::BinOper *join_condition, const Executor *executor)
Definition: HashJoin.cpp:1084
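
On GPU, a fragment pair is only ruled out when both fragments carry a shard id, those ids differ, and the join condition itself resolves to a non-zero shard count; unsharded fragments (shard == -1) are never skipped this way. The shard comparison itself reduces to a few checks (sketch, with plain ints standing in for FragmentInfo::shard):

#include <iostream>

// Necessary (not sufficient) condition used by skipFragmentPair(): the pair is
// only a candidate for skipping when both sides are sharded and differ.
bool on_different_shards(int outer_shard, int inner_shard) {
  if (outer_shard == -1 || inner_shard == -1) {
    return false;  // unsharded fragment: never skip on shard information alone
  }
  return outer_shard != inner_shard;
}

int main() {
  std::cout << on_different_shards(0, 1) << " "    // 1: candidate for skipping
            << on_different_shards(-1, 3) << "\n";  // 0: outer side unsharded
  return 0;
}
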

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

llvm::Value * Executor::spillDoubleElement ( llvm::Value *  elem_val,
llvm::Type *  elem_ty 
)
private

Definition at line 19 of file MaxwellCodegenPatch.cpp.

19  {
20  auto var_ptr = cgen_state_->ir_builder_.CreateAlloca(elem_ty);
21  cgen_state_->ir_builder_.CreateStore(elem_val, var_ptr);
22  return var_ptr;
23 }
std::unique_ptr< CgenState > cgen_state_
Definition: Execute.h:1502
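
spillDoubleElement() materializes a value into a stack slot so it can be passed by pointer: an alloca followed by a store. A standalone sketch of the same IRBuilder sequence, assuming an LLVM development environment (the module and function exist only to host the generated instructions):

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Support/raw_ostream.h>

int main() {
  llvm::LLVMContext ctx;
  llvm::Module module("spill_sketch", ctx);

  // A void function to hold the generated instructions.
  auto* fn_ty = llvm::FunctionType::get(llvm::Type::getVoidTy(ctx), false);
  auto* fn =
      llvm::Function::Create(fn_ty, llvm::Function::ExternalLinkage, "f", &module);
  auto* entry = llvm::BasicBlock::Create(ctx, "entry", fn);
  llvm::IRBuilder<> builder(entry);

  // Spill a double: allocate a stack slot, store the value, keep the pointer.
  llvm::Value* elem_val = llvm::ConstantFP::get(llvm::Type::getDoubleTy(ctx), 3.5);
  llvm::Value* slot = builder.CreateAlloca(llvm::Type::getDoubleTy(ctx));
  builder.CreateStore(elem_val, slot);
  builder.CreateRetVoid();

  return llvm::verifyFunction(*fn, &llvm::errs()) ? 1 : 0;
}
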
void Executor::unregisterActiveModule ( const int  device_id)
static

Definition at line 30 of file GpuInterrupt.cpp.

References CHECK_LT, to_string(), and VLOG.

Referenced by resetInterrupt().

30  {
31 #ifdef HAVE_CUDA
32  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
33  CHECK_LT(device_id, max_gpu_count);
34  if ((gpu_active_modules_device_mask_ & (1 << device_id)) == 0) {
35  return;
36  }
37  gpu_active_modules_device_mask_ ^= (1 << device_id);
38  VLOG(1) << "Unregistered module on device " << std::to_string(device_id);
39 #endif
40 }
static const int max_gpu_count
Definition: Execute.h:1535
static uint32_t gpu_active_modules_device_mask_
Definition: Execute.h:1540
std::string to_string(char const *&&v)
#define CHECK_LT(x, y)
Definition: Logger.h:303
#define VLOG(n)
Definition: Logger.h:388
static std::mutex gpu_active_modules_mutex_
Definition: Execute.h:1539
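
The active-module bookkeeping is a per-device bitmask guarded by a mutex: registering a module sets the device's bit, and unregistering clears it with XOR only if it was set. A minimal sketch of that mask manipulation (single-threaded here, so the mutex is omitted; the device bound is illustrative):

#include <cstdint>
#include <iostream>

constexpr int kMaxGpuCountSketch = 32;  // one bit per device in a 32-bit mask

uint32_t active_modules_mask{0x0};

void register_device(int device_id) {
  if (device_id < 0 || device_id >= kMaxGpuCountSketch) {
    return;
  }
  active_modules_mask |= (1u << device_id);
}

void unregister_device(int device_id) {
  if (device_id < 0 || device_id >= kMaxGpuCountSketch) {
    return;
  }
  if ((active_modules_mask & (1u << device_id)) == 0) {
    return;  // nothing registered on this device
  }
  active_modules_mask ^= (1u << device_id);  // clear exactly this device's bit
}

int main() {
  register_device(0);
  register_device(2);
  unregister_device(0);
  std::cout << std::hex << active_modules_mask << "\n";  // 4: only device 2 remains
  return 0;
}
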

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

static void Executor::update_after_registration ( bool  update_runtime_modules_only = false)
inlinestatic

Definition at line 1420 of file Execute.h.

References executors_.

Referenced by registerExtensionFunctions().

1420  {
1421  for (auto executor_item : Executor::executors_) {
1422  executor_item.second->update_extension_modules(update_runtime_modules_only);
1423  }
1424  }
static std::map< int, std::shared_ptr< Executor > > executors_
Definition: Execute.h:1581

+ Here is the caller graph for this function:

void Executor::update_extension_modules ( bool  update_runtime_modules_only = false)

Definition at line 346 of file Execute.cpp.

References CHECK, extension_module_sources, LOG, read_llvm_module_from_bc_file(), read_llvm_module_from_ir_file(), read_llvm_module_from_ir_string(), rt_geos_module, rt_libdevice_module, rt_udf_cpu_module, rt_udf_gpu_module, template_module, toString(), udf_cpu_module, udf_gpu_module, UNREACHABLE, and logger::WARNING.

346  {
347  auto read_module = [&](Executor::ExtModuleKinds module_kind,
348  const std::string& source) {
349  /*
350  source can be either a filename of a LLVM IR
351  or LLVM BC source, or a string containing
352  LLVM IR code.
353  */
354  CHECK(!source.empty());
355  switch (module_kind) {
356  case Executor::ExtModuleKinds::template_module:
357  case Executor::ExtModuleKinds::rt_geos_module:
358  case Executor::ExtModuleKinds::rt_libdevice_module: {
359  return read_llvm_module_from_bc_file(source, getContext());
360  }
361  case Executor::ExtModuleKinds::udf_cpu_module: {
362  return read_llvm_module_from_ir_file(source, getContext(), false);
363  }
364  case Executor::ExtModuleKinds::udf_gpu_module: {
365  return read_llvm_module_from_ir_file(source, getContext(), true);
366  }
367  case Executor::ExtModuleKinds::rt_udf_cpu_module: {
368  return read_llvm_module_from_ir_string(source, getContext(), false);
369  }
370  case Executor::ExtModuleKinds::rt_udf_gpu_module: {
371  return read_llvm_module_from_ir_string(source, getContext(), true);
372  }
373  default: {
374  UNREACHABLE();
375  return std::unique_ptr<llvm::Module>();
376  }
377  }
378  };
379  auto update_module = [&](Executor::ExtModuleKinds module_kind,
380  bool erase_not_found = false) {
381  auto it = Executor::extension_module_sources.find(module_kind);
382  if (it != Executor::extension_module_sources.end()) {
383  auto llvm_module = read_module(module_kind, it->second);
384  if (llvm_module) {
385  extension_modules_[module_kind] = std::move(llvm_module);
386  } else if (erase_not_found) {
387  extension_modules_.erase(module_kind);
388  } else {
389  if (extension_modules_.find(module_kind) == extension_modules_.end()) {
390  LOG(WARNING) << "Failed to update " << ::toString(module_kind)
391  << " LLVM module. The module will be unavailable.";
392  } else {
393  LOG(WARNING) << "Failed to update " << ::toString(module_kind)
394  << " LLVM module. Using the existing module.";
395  }
396  }
397  } else {
398  if (erase_not_found) {
399  extension_modules_.erase(module_kind);
400  } else {
401  if (extension_modules_.find(module_kind) == extension_modules_.end()) {
402  LOG(WARNING) << "Source of " << ::toString(module_kind)
403  << " LLVM module is unavailable. The module will be unavailable.";
404  } else {
405  LOG(WARNING) << "Source of " << ::toString(module_kind)
406  << " LLVM module is unavailable. Using the existing module.";
407  }
408  }
409  }
410  };
411 
412  if (!update_runtime_modules_only) {
413  // required compile-time modules, their requirements are enforced
414  // by Executor::initialize_extension_module_sources():
415  update_module(Executor::ExtModuleKinds::template_module);
416 #ifdef ENABLE_GEOS
417  update_module(Executor::ExtModuleKinds::rt_geos_module);
418 #endif
419  // load-time modules, these are optional:
420  update_module(Executor::ExtModuleKinds::udf_cpu_module, true);
421 #ifdef HAVE_CUDA
422  update_module(Executor::ExtModuleKinds::udf_gpu_module, true);
423  update_module(Executor::ExtModuleKinds::rt_libdevice_module);
424 #endif
425  }
426  // run-time modules, these are optional and erasable:
427  update_module(Executor::ExtModuleKinds::rt_udf_cpu_module, true);
428 #ifdef HAVE_CUDA
429  update_module(Executor::ExtModuleKinds::rt_udf_gpu_module, true);
430 #endif
431 }
ExtModuleKinds
Definition: Execute.h:518
#define LOG(tag)
Definition: Logger.h:285
#define UNREACHABLE()
Definition: Logger.h:338
std::unique_ptr< llvm::Module > read_llvm_module_from_ir_string(const std::string &udf_ir_string, llvm::LLVMContext &ctx, bool is_gpu=false)
std::string toString(const QueryDescriptionType &type)
Definition: Types.h:64
std::unique_ptr< llvm::Module > read_llvm_module_from_ir_file(const std::string &udf_ir_filename, llvm::LLVMContext &ctx, bool is_gpu=false)
static std::map< ExtModuleKinds, std::string > extension_module_sources
Definition: Execute.h:528
#define CHECK(condition)
Definition: Logger.h:291
llvm::LLVMContext & getContext()
Definition: Execute.h:1417
std::unique_ptr< llvm::Module > read_llvm_module_from_bc_file(const std::string &udf_ir_filename, llvm::LLVMContext &ctx)
std::map< ExtModuleKinds, std::unique_ptr< llvm::Module > > extension_modules_
Definition: Execute.h:1517
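
The update_module lambda above implements a simple "replace, erase, or keep with a warning" policy over the extension-module map. A standalone sketch of that policy, using strings in place of llvm::Module instances (module kinds and sources here are illustrative):

#include <iostream>
#include <map>
#include <memory>
#include <string>

enum class ModuleKind { template_module, rt_udf_cpu_module };

std::map<ModuleKind, std::string> module_sources;                   // kind -> source
std::map<ModuleKind, std::unique_ptr<std::string>> loaded_modules;  // kind -> module

// Pretend "compilation": succeeds whenever the source is non-empty.
std::unique_ptr<std::string> read_module(const std::string& source) {
  return source.empty() ? nullptr : std::make_unique<std::string>(source);
}

void update_module(ModuleKind kind, bool erase_not_found = false) {
  auto it = module_sources.find(kind);
  if (it != module_sources.end()) {
    if (auto m = read_module(it->second)) {
      loaded_modules[kind] = std::move(m);  // replace with the freshly read module
      return;
    }
  }
  if (erase_not_found) {
    loaded_modules.erase(kind);             // runtime modules are erasable
  } else if (loaded_modules.count(kind) == 0) {
    std::cerr << "module unavailable\n";    // nothing old to fall back on
  }                                         // else: keep the existing module
}

int main() {
  module_sources[ModuleKind::template_module] = "define void @f() { ret void }";
  update_module(ModuleKind::template_module);
  update_module(ModuleKind::rt_udf_cpu_module, /*erase_not_found=*/true);
  return loaded_modules.count(ModuleKind::template_module) ? 0 : 1;
}
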

+ Here is the call graph for this function:

bool Executor::updateQuerySessionExecutorAssignment ( const QuerySessionId query_session,
const std::string &  submitted_time_str,
const size_t  executor_id,
heavyai::unique_lock< heavyai::shared_mutex > &  write_lock 
)

Definition at line 5150 of file Execute.cpp.

References queries_session_map_.

Referenced by attachExecutorToQuerySession().

5154  {
5155  // update the executor id of the query session
5156  if (query_session.empty()) {
5157  return false;
5158  }
5159  if (queries_session_map_.count(query_session)) {
5160  auto storage = queries_session_map_.at(query_session);
5161  for (auto it = storage.begin(); it != storage.end(); it++) {
5162  auto target_submitted_t_str = it->second.getQuerySubmittedTime();
5163  // no time difference --> found the target query status
5164  if (submitted_time_str.compare(target_submitted_t_str) == 0) {
5165  queries_session_map_.at(query_session)
5166  .at(submitted_time_str)
5167  .setExecutorId(executor_id);
5168  return true;
5169  }
5170  }
5171  }
5172  return false;
5173 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580

+ Here is the caller graph for this function:

void Executor::updateQuerySessionStatus ( const QuerySessionId query_session,
const std::string &  submitted_time_str,
const QuerySessionStatus::QueryStatus  new_query_status 
)

Definition at line 5041 of file Execute.cpp.

References current_query_session_, executor_session_mutex_, and updateQuerySessionStatusWithLock().

Referenced by executeWorkUnitImpl().

5044  {
5045  // update the running query session's the current status
5046  heavyai::unique_lock<heavyai::shared_mutex> session_write_lock(executor_session_mutex_);
5047  if (query_session.empty()) {
5048  return;
5049  }
5050  if (new_query_status == QuerySessionStatus::QueryStatus::RUNNING_QUERY_KERNEL) {
5051  current_query_session_ = query_session;
5052  }
5053  updateQuerySessionStatusWithLock(
5054  query_session, submitted_time_str, new_query_status, session_write_lock);
5055 }
QuerySessionId current_query_session_
Definition: Execute.h:1576
bool updateQuerySessionStatusWithLock(const QuerySessionId &query_session, const std::string &submitted_time_str, const QuerySessionStatus::QueryStatus updated_query_status, heavyai::unique_lock< heavyai::shared_mutex > &write_lock)
Definition: Execute.cpp:5124
std::unique_lock< T > unique_lock
static heavyai::shared_mutex executor_session_mutex_
Definition: Execute.h:1574

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool Executor::updateQuerySessionStatusWithLock ( const QuerySessionId query_session,
const std::string &  submitted_time_str,
const QuerySessionStatus::QueryStatus  updated_query_status,
heavyai::unique_lock< heavyai::shared_mutex > &  write_lock 
)

Definition at line 5124 of file Execute.cpp.

References queries_session_map_.

Referenced by attachExecutorToQuerySession(), and updateQuerySessionStatus().

5128  {
5129  // an internal API that updates query session status
5130  if (query_session.empty()) {
5131  return false;
5132  }
5133  if (queries_session_map_.count(query_session)) {
5134  for (auto& query_status : queries_session_map_.at(query_session)) {
5135  auto target_submitted_t_str = query_status.second.getQuerySubmittedTime();
5136  // no time difference --> found the target query status
5137  if (submitted_time_str.compare(target_submitted_t_str) == 0) {
5138  auto prev_status = query_status.second.getQueryStatus();
5139  if (prev_status == updated_query_status) {
5140  return false;
5141  }
5142  query_status.second.setQueryStatus(updated_query_status);
5143  return true;
5144  }
5145  }
5146  }
5147  return false;
5148 }
static QuerySessionMap queries_session_map_
Definition: Execute.h:1580
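
Session bookkeeping is a two-level map keyed by session id and then by submission time; the updater above walks the entries for a session, matches the submission timestamp, and rejects no-op status changes. A compact sketch of that lookup-and-update, with string statuses standing in for QuerySessionStatus::QueryStatus and no locking:

#include <map>
#include <string>

// session id -> (submitted time -> status); strings stand in for the real types.
using SessionMapSketch =
    std::map<std::string, std::map<std::string, std::string>>;

// Returns true only when the entry exists and the status actually changes,
// mirroring updateQuerySessionStatusWithLock().
bool update_status(SessionMapSketch& sessions, const std::string& session,
                   const std::string& submitted_time, const std::string& new_status) {
  if (session.empty()) {
    return false;
  }
  auto session_it = sessions.find(session);
  if (session_it == sessions.end()) {
    return false;
  }
  auto status_it = session_it->second.find(submitted_time);
  if (status_it == session_it->second.end() || status_it->second == new_status) {
    return false;
  }
  status_it->second = new_status;
  return true;
}

int main() {
  SessionMapSketch sessions{{"session-a", {{"2024-01-01 00:00:00", "PENDING"}}}};
  return update_status(sessions, "session-a", "2024-01-01 00:00:00", "RUNNING") ? 0 : 1;
}
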

+ Here is the caller graph for this function:

int8_t Executor::warpSize ( ) const

Definition at line 4310 of file Execute.cpp.

References CHECK, cudaMgr(), and CudaMgr_Namespace::CudaMgr::getAllDeviceProperties().

4310  {
4311  const auto& dev_props = cudaMgr()->getAllDeviceProperties();
4312  CHECK(!dev_props.empty());
4313  return dev_props.front().warpSize;
4314 }
CudaMgr_Namespace::CudaMgr * cudaMgr() const
Definition: Execute.h:865
#define CHECK(condition)
Definition: Logger.h:291
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:134

+ Here is the call graph for this function:

Friends And Related Function Documentation

friend class BaselineJoinHashTable
friend

Definition at line 1647 of file Execute.h.

friend class BoundingBoxIntersectJoinHashTable
friend

Definition at line 1654 of file Execute.h.

friend class CodeGenerator
friend

Definition at line 1648 of file Execute.h.

friend class ColumnFetcher
friend

Definition at line 1649 of file Execute.h.

friend struct DiamondCodegen
friend

Definition at line 1650 of file Execute.h.

friend class ExecutionKernel
friend

Definition at line 1651 of file Execute.h.

friend class GroupByAndAggregate
friend

Definition at line 1656 of file Execute.h.

friend class HashJoin
friend

Definition at line 1653 of file Execute.h.

friend class InValuesBitmap
friend

Definition at line 1663 of file Execute.h.

friend class KernelSubtask
friend

Definition at line 1652 of file Execute.h.

friend class LeafAggregator
friend

Definition at line 1665 of file Execute.h.

friend class PendingExecutionClosure
friend

Definition at line 1668 of file Execute.h.

friend class PerfectJoinHashTable
friend

Definition at line 1666 of file Execute.h.

friend class QueryCompilationDescriptor
friend

Definition at line 1657 of file Execute.h.

friend class QueryExecutionContext
friend

Definition at line 1661 of file Execute.h.

friend class QueryFragmentDescriptor
friend

Definition at line 1660 of file Execute.h.

friend class QueryMemoryDescriptor
friend

Definition at line 1658 of file Execute.h.

Referenced by executeWorkUnitImpl(), reduceMultiDeviceResults(), and resultsUnion().

friend class QueryMemoryInitializer
friend

Definition at line 1659 of file Execute.h.

friend class QueryRewriter
friend

Definition at line 1667 of file Execute.h.

friend class RangeJoinHashTable
friend

Definition at line 1655 of file Execute.h.

friend class RelAlgExecutor
friend

Definition at line 1669 of file Execute.h.

friend class ResultSet
friend

Definition at line 1662 of file Execute.h.

friend class StringDictionaryTranslationMgr
friend

Definition at line 1664 of file Execute.h.

friend class TableFunctionCompilationContext
friend

Definition at line 1671 of file Execute.h.

friend class TableFunctionExecutionContext
friend

Definition at line 1672 of file Execute.h.

friend class TableOptimizer
friend

Definition at line 1670 of file Execute.h.

friend struct TargetExprCodegen
friend

Definition at line 1674 of file Execute.h.

friend struct TargetExprCodegenBuilder
friend

Definition at line 1673 of file Execute.h.

friend class WindowProjectNodeContext
friend

Definition at line 1675 of file Execute.h.

Member Data Documentation

WindowFunctionContext* Executor::active_window_function_ {nullptr}
private

Definition at line 1569 of file Execute.h.

AggregatedColRange Executor::agg_col_range_cache_
private
const size_t Executor::auto_cpu_mem_bytes {size_t(0)}
static

Definition at line 1643 of file Execute.h.

Referenced by DBHandler::init_executor_resource_mgr().

const size_t Executor::auto_num_threads {size_t(0)}
staticprivate

Definition at line 1536 of file Execute.h.

Referenced by launchKernelsImpl(), and launchKernelsLocked().

const size_t Executor::baseline_threshold
staticprivate
Initial value: {1000000}

Definition at line 1549 of file Execute.h.

Referenced by getBaselineThreshold(), and ResultSet::sort().

unsigned Executor::block_size_x_
private

Definition at line 1552 of file Execute.h.

Referenced by blockSize(), resetBlockSize(), and setBlockSize().

std::unordered_map< CardinalityCacheKey, size_t > Executor::cardinality_cache_
staticprivate
std::mutex Executor::compilation_mutex_

Definition at line 1635 of file Execute.h.

int64_t Executor::compilation_queue_time_ms_ = 0
private

Definition at line 1563 of file Execute.h.

Referenced by executeWorkUnit(), and nukeOldState().

std::unique_ptr<llvm::LLVMContext> Executor::context_
private

Definition at line 1477 of file Execute.h.

Referenced by getContext().

Data_Namespace::DataMgr* Executor::data_mgr_
private
const std::string Executor::debug_dir_
private

Definition at line 1555 of file Execute.h.

const std::string Executor::debug_file_
private

Definition at line 1556 of file Execute.h.

const int32_t Executor::ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED {11}
static
const int32_t Executor::ERR_OUT_OF_CPU_MEM {6}
static
const int32_t Executor::ERR_OUT_OF_RENDER_MEM {5}
static
const int32_t Executor::ERR_OUT_OF_SLOTS {3}
static

Definition at line 1617 of file Execute.h.

Referenced by executeWorkUnitImpl().

const int32_t Executor::ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY {14}
static
const int32_t Executor::ERR_STRING_CONST_IN_RESULTSET {13}
static
const int32_t Executor::ERR_TOO_MANY_LITERALS {12}
static
const int32_t Executor::ERR_UNSUPPORTED_SELF_JOIN {4}
static
const int32_t Executor::ERR_WIDTH_BUCKET_INVALID_ARGUMENT {17}
static
heavyai::shared_mutex Executor::execute_mutex_
staticprivate
std::map< int, std::shared_ptr< Executor > > Executor::executors_
staticprivate
heavyai::shared_mutex Executor::executors_cache_mutex_
staticprivate

Definition at line 1602 of file Execute.h.

Referenced by getExecutor(), nukeCacheOfExecutors(), and registerExtensionFunctions().

std::map< Executor::ExtModuleKinds, std::string > Executor::extension_module_sources
static
std::map<ExtModuleKinds, std::unique_ptr<llvm::Module> > Executor::extension_modules_
private

Definition at line 1517 of file Execute.h.

Referenced by get_extension_module(), and has_extension_module().

void * Executor::gpu_active_modules_
staticprivate

Definition at line 1541 of file Execute.h.

uint32_t Executor::gpu_active_modules_device_mask_ {0x0}
staticprivate

Definition at line 1540 of file Execute.h.

std::mutex Executor::gpu_active_modules_mutex_
staticprivate

Definition at line 1539 of file Execute.h.

std::mutex Executor::gpu_exec_mutex_[max_gpu_count]
private

Definition at line 1537 of file Execute.h.

unsigned Executor::grid_size_x_
private

Definition at line 1553 of file Execute.h.

Referenced by gridSize(), numBlocksPerMP(), resetGridSize(), and setGridSize().

InputTableInfoCache Executor::input_table_info_cache_
mutableprivate

Definition at line 1571 of file Execute.h.

Referenced by clearMetaInfoCache(), and getTableInfo().

std::atomic<bool> Executor::interrupted_ {false}
private
constexpr ExecutorId Executor::INVALID_EXECUTOR_ID = SIZE_MAX
static

Definition at line 424 of file Execute.h.

Referenced by CgenState::getExecutor().

std::mutex Executor::kernel_mutex_
static

Definition at line 1641 of file Execute.h.

Referenced by executeWorkUnitPerFragment(), and launchKernelsLocked().

int64_t Executor::kernel_queue_time_ms_ = 0
private
QueryPlanDAG Executor::latest_query_plan_extracted_ {EMPTY_QUERY_PLAN}
staticprivate

Definition at line 1612 of file Execute.h.

Referenced by getLatestQueryPlanDagExtracted(), and registerExtractedQueryPlanDag().

int const Executor::max_gpu_count
staticprivate

Definition at line 1535 of file Execute.h.

Referenced by ExecutionKernel::runImpl().

const size_t Executor::max_gpu_slab_size_
private

Definition at line 1554 of file Execute.h.

Referenced by maxGpuSlabSize().

std::unique_ptr<llvm::TargetMachine> Executor::nvptx_target_machine_
mutableprivate

Definition at line 1547 of file Execute.h.

QueryPlanDagCache Executor::query_plan_dag_cache_
staticprivate

Definition at line 1604 of file Execute.h.

Referenced by getQueryPlanDagCache().

std::mutex Executor::register_runtime_extension_functions_mutex_
static

Definition at line 1640 of file Execute.h.

Referenced by get_rt_udf_module(), and registerExtensionFunctions().

ResultSetRecyclerHolder Executor::resultset_recycler_holder_
staticprivate

Definition at line 1608 of file Execute.h.

Referenced by getResultSetRecyclerHolder().

std::shared_ptr<RowSetMemoryOwner> Executor::row_set_mem_owner_
private
std::mutex Executor::str_dict_mutex_
mutableprivate
TableGenerations Executor::table_generations_
private

Definition at line 1573 of file Execute.h.

Referenced by clearMetaInfoCache(), dumpCache(), getTableGeneration(), and setupCaching().

TableIdToNodeMap Executor::table_id_to_node_map_
private

Definition at line 1560 of file Execute.h.

const TemporaryTables* Executor::temporary_tables_
private

Definition at line 1559 of file Execute.h.

Referenced by getTemporaryTables().

constexpr ExecutorId Executor::UNITARY_EXECUTOR_ID = 0
static

Definition at line 423 of file Execute.h.

Referenced by acquireExecuteMutex(), checkNonKernelTimeInterrupted(), Parser::OptimizeTableStmt::execute(), Parser::CopyTableStmt::execute(), Parser::InsertValuesStmt::execute(), DBHandler::execute_rel_alg(), QueryRunner::QueryRunner::extractQueryPlanDag(), DBHandler::get_queries_info(), Fragmenter_Namespace::FragmentInfo::getChunkMetadataMap(), QueryRunner::QueryRunner::getExecutor(), Parser::LocalQueryConnector::getOuterFragmentCount(), QueryRunner::QueryRunner::getParsedGlobalQueryHints(), QueryRunner::QueryRunner::getParsedQueryHint(), QueryRunner::QueryRunner::getParsedQueryHints(), DBHandler::getQueries(), QueryRunner::QueryRunner::getQueryInfoForDataRecyclerTest(), QueryRunner::QueryRunner::getRaExecutionSequence(), QueryRunner::QueryRunner::getRootNodeFromParsedQuery(), DBHandler::import_table(), import_export::Importer::importDelimited(), import_export::Importer::importGDALGeo(), import_export::Importer::importGDALRaster(), DBHandler::importGeoTableSingle(), DBHandler::interrupt(), DBHandler::interruptQuery(), DBHandler::invalidate_cur_session(), anonymous_namespace{DBHandler.cpp}::log_cache_size(), migrations::MigrationMgr::migrateDateInDaysMetadata(), Parser::InsertIntoTableAsSelectStmt::populateData(), Parser::LocalQueryConnector::query(), QueryRunner::anonymous_namespace{QueryRunner.cpp}::run_select_query_with_filter_push_down(), QueryRunner::QueryRunner::runSQLWithAllowingInterrupt(), DBHandler::set_cur_session(), DBHandler::sql_execute_impl(), and anonymous_namespace{DdlCommandExecutor.cpp}::vacuum_table_if_required().

std::unique_ptr<WindowProjectNodeContext> Executor::window_project_node_context_owned_
private

Definition at line 1567 of file Execute.h.


The documentation for this class was generated from the following files: