32 #include <unordered_map>
39 namespace Fragmenter_Namespace {
43 namespace Data_Namespace {
70 const std::vector<InputTableInfo>& query_infos,
71 const std::vector<Data_Namespace::MemoryInfo>& gpu_mem_infos,
72 const double gpu_input_mem_limit_percent,
73 const std::vector<size_t> allowed_outer_fragment_indices);
76 std::map<int, const TableFragments*>& all_tables_fragments,
78 const std::vector<InputTableInfo>& query_infos);
81 const std::vector<uint64_t>& frag_offsets,
82 const int device_count,
84 const bool enable_multifrag_kernels,
85 const bool enable_inner_join_fragment_skipping,
92 template <typename DISPATCH_FCN>
95 const auto& execution_kernels = device_itr.second;
96 CHECK_EQ(execution_kernels.size(), size_t(1));
98 const auto& fragments_list = execution_kernels.front().fragments;
109 template <typename DISPATCH_FCN>
116 size_t tuple_count = 0;
118 std::unordered_map<int, size_t> execution_kernel_index;
120 CHECK(execution_kernel_index.insert(std::make_pair(device_itr.first, size_t(0)))
124 bool dispatch_finished = false;
125 while (!dispatch_finished) {
126 dispatch_finished = true;
127 for (const auto& device_itr : execution_kernels_per_device_) {
128 auto& kernel_idx = execution_kernel_index[device_itr.first];
129 if (kernel_idx < device_itr.second.size()) {
130 dispatch_finished = false;
131 const auto& execution_kernel = device_itr.second[kernel_idx++];
159 const std::vector<uint64_t>& frag_offsets,
160 const int device_count,
161 const size_t num_bytes_for_row,
166 const std::vector<uint64_t>& frag_offsets,
167 const int device_count,
168 const size_t num_bytes_for_row,
173 const std::vector<uint64_t>& frag_offsets,
174 const int device_count,
175 const size_t num_bytes_for_row,
177 const bool enable_inner_join_fragment_skipping,
184 const bool is_temporary_table,
185 const std::vector<uint64_t>& frag_offsets,
186 const int device_count,
187 const size_t num_bytes_for_row,
189 const std::optional<size_t> table_desc_offset,
199 const size_t num_cols);
std::map< int, const TableFragments * > selected_tables_fragments_
QueryFragmentDescriptor(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const std::vector< Data_Namespace::MemoryInfo > &gpu_mem_infos, const double gpu_input_mem_limit_percent, const std::vector< size_t > allowed_outer_fragment_indices)
std::optional< size_t > outer_tuple_count
std::map< size_t, size_t > tuple_count_per_device_
bool terminateDispatchMaybe(size_t &tuple_count, const RelAlgExecutionUnit &ra_exe_unit, const ExecutionKernelDescriptor &kernel) const
int64_t rowid_lookup_key_
std::ostream & operator<<(std::ostream &os, const SessionInfo &session_info)
std::vector< Fragmenter_Namespace::FragmentInfo > TableFragments
void assignFragsToKernelDispatch(DISPATCH_FCN f, const RelAlgExecutionUnit &ra_exe_unit) const
void buildFragmentPerKernelMap(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ExecutorDeviceType &device_type, Executor *executor)
std::vector< FragmentsPerTable > FragmentsList
double gpu_input_mem_limit_percent_
bool shouldCheckWorkUnitWatchdog() const
std::map< int, std::vector< ExecutionKernelDescriptor > > execution_kernels_per_device_
Used by Fragmenter classes to store info about each fragment - the fragment id and number of tuples (rows) stored by that fragment
void checkDeviceMemoryUsage(const Fragmenter_Namespace::FragmentInfo &fragment, const int device_id, const size_t num_cols)
size_t outer_fragments_size_
void buildMultifragKernelMap(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ExecutorDeviceType &device_type, const bool enable_inner_join_fragment_skipping, Executor *executor)
void assignFragsToMultiDispatch(DISPATCH_FCN f) const
void buildFragmentPerKernelForTable(const TableFragments *fragments, const RelAlgExecutionUnit &ra_exe_unit, const InputDescriptor &table_desc, const bool is_temporary_table, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ChunkMetadataVector &deleted_chunk_metadata_vec, const std::optional< size_t > table_desc_offset, const ExecutorDeviceType &device_type, Executor *executor)
std::vector< size_t > allowed_outer_fragment_indices_
void buildFragmentPerKernelMapForUnion(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ExecutorDeviceType &device_type, Executor *executor)
std::vector< size_t > fragment_ids
void buildFragmentKernelMap(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const ExecutorDeviceType &device_type, const bool enable_multifrag_kernels, const bool enable_inner_join_fragment_skipping, Executor *executor)
std::map< size_t, size_t > available_gpu_mem_bytes_
static void computeAllTablesFragments(std::map< int, const TableFragments * > &all_tables_fragments, const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos)