_query_plan_dag_cache_8h_source.html

 /*

  * Copyright 2022 HEAVY.AI, Inc.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */


 #pragma once


 #include <boost/graph/adjacency_list.hpp>

 #include <boost/graph/graph_utility.hpp>

 #include <boost/graph/graphviz.hpp>

 #include <boost/graph/labeled_graph.hpp>


 #include <iostream>

 #include <memory>

 #include <vector>


 #include "RelAlgDag.h"

 #include "RelAlgExecutionUnit.h"

 #include "ScalarExprVisitor.h"

 #include "Visitors/RelRexDagVisitor.h"


 // Default limit for the node cache; used as the default constructor argument of
 // QueryPlanDagCache below.
 constexpr size_t MAX_NODE_CACHE_SIZE = 1e9;  // set ~1GB as cache size threshold


 // Node IDs are kept unique by keying them on the hash of the explained contents of
 // each rel node.
 using RelNodeMap = std::unordered_map<RelNodeExplainedHash, RelNodeId>;

 // A labeled graph is also maintained for the extracted query plan DAGs.
 // In the future it can be used to support advanced features such as partial result set
 // reuse and compiled kernel reuse, by exploiting graph-centric computation like
 // subgraph matching and graph isomorphism.
 using QueryPlanDag = boost::labeled_graph<AdjacentList, RelNodeId, boost::hash_mapS>;


 // Scalar-expression visitor whose result type is a flat list of non-owning pointers to
 // the column variables encountered while visiting an expression.
 class ColumnVarsVisitor
     : public ScalarExprVisitor<std::vector<const Analyzer::ColumnVar*>> {
  protected:
   // A column variable contributes exactly itself to the result.
   std::vector<const Analyzer::ColumnVar*> visitColumnVar(
       const Analyzer::ColumnVar* column) const override {
     return std::vector<const Analyzer::ColumnVar*>{column};
   }

   // Visits every element of the tuple with a fresh visitor and concatenates the
   // per-element results in tuple order.
   std::vector<const Analyzer::ColumnVar*> visitColumnVarTuple(
       const Analyzer::ExpressionTuple* expr_tuple) const override {
     ColumnVarsVisitor element_visitor;
     std::vector<const Analyzer::ColumnVar*> collected;
     for (const auto& element : expr_tuple->getTuple()) {
       const auto element_col_vars = element_visitor.visit(element.get());
       collected.insert(
           collected.end(), element_col_vars.begin(), element_col_vars.end());
     }
     return collected;
   }

   // Combines two partial results: everything in `aggregate` followed by everything in
   // `next_result`.
   std::vector<const Analyzer::ColumnVar*> aggregateResult(
       const std::vector<const Analyzer::ColumnVar*>& aggregate,
       const std::vector<const Analyzer::ColumnVar*>& next_result) const override {
     std::vector<const Analyzer::ColumnVar*> merged(aggregate);
     merged.insert(merged.end(), next_result.begin(), next_result.end());
     return merged;
   }
 };


 // DAG visitor that records, for every scan node it reaches, a hash of the chunk key
 // prefix reported by the fragmenter of the scanned table.
 class ScanNodeTableKeyCollector final : public RelRexDagVisitor {
  public:
   using RelRexDagVisitor::visit;

   // Walks the plan rooted at `rel_alg_node` and returns the set of hashed table keys
   // gathered from its scan nodes.
   static std::unordered_set<size_t> getScanNodeTableKey(RelAlgNode const* rel_alg_node) {
     ScanNodeTableKeyCollector collector;
     collector.visit(rel_alg_node);
     // The collector is discarded right after this call, so its set is moved out
     // instead of being copied.
     return std::move(collector.table_keys_);
   }

  private:
   // Hashes the chunk key prefix of the scanned table, stores it, and then continues
   // with the base-class traversal of the scan node.
   void visit(RelScan const* scan_node) override {
     CHECK(scan_node->getTableDescriptor());
     CHECK(scan_node->getTableDescriptor()->fragmenter);
     const auto& table_fragmenter = scan_node->getTableDescriptor()->fragmenter;
     table_keys_.insert(
         boost::hash_value(table_fragmenter->getFragmentsForQuery().chunkKeyPrefix));
     RelRexDagVisitor::visit(scan_node);
   }

   // hashed chunk key prefixes of the tables seen so far
   std::unordered_set<size_t> table_keys_;
 };


 // One of the main data structures for data recycling: it manages the shape of query
 // plans as a DAG representation.
 //
 // A query plan DAG is a sequence of unique node IDs. The same node ID is assigned to a
 // node iff that node was already seen in a different query plan extracted earlier.
 //
 // To retrieve a query plan DAG, each rel node of the input query plan is visited from
 // the root to the bottom (left-to-right child visiting) and checked for whether it is
 // valid for DAG extraction and its usage. DAG extraction is not allowed if the query
 // plan contains a rel node that is not supported for data recycling, such as logical
 // values. If the visited node is valid, its uniqueness is checked against the DAG cache:
 // a new unique ID is assigned when the node is new, otherwise the existing node ID is
 // reused. After visiting the whole plan, the resulting sequence of node IDs is returned
 // as the extracted query plan DAG.

 class QueryPlanDagCache {

  public:

   // Creates a cache whose node map size limit is `max_node_cache_size`
   // (MAX_NODE_CACHE_SIZE unless the caller overrides it).
   QueryPlanDagCache(size_t max_node_cache_size = MAX_NODE_CACHE_SIZE)

       : max_node_map_size_(max_node_cache_size) {}


   // The cache is neither movable nor copyable.
   QueryPlanDagCache(QueryPlanDagCache&& other) = delete;

   QueryPlanDagCache& operator=(QueryPlanDagCache&& other) = delete;

   QueryPlanDagCache(const QueryPlanDagCache&) = delete;

   QueryPlanDagCache& operator=(const QueryPlanDagCache&) = delete;


   // Returns an optional node ID for the given rel node.
   // NOTE(review): judging by the name, this presumably registers the node when it is
   // not cached yet and yields std::nullopt when no ID can be provided -- confirm
   // against QueryPlanDagCache.cpp.
   std::optional<RelNodeId> addNodeIfAbsent(const RelAlgNode*);


   // NOTE(review): presumably records a parent/child relationship between the two node
   // IDs in cached_query_plan_dag_ -- confirm against QueryPlanDagCache.cpp.
   void connectNodes(const RelNodeId parent_id, const RelNodeId child_id);


   // Returns column variables for the expression `target`.
   // NOTE(review): presumably collected via col_var_visitor_ -- confirm.
   std::vector<const Analyzer::ColumnVar*> collectColVars(const Analyzer::Expr* target);


   // Returns the current size of the node map.
   // NOTE(review): presumably the variant that takes cache_lock_ (an "Unlocked"
   // counterpart exists below) -- confirm.
   size_t getCurrentNodeMapSize() const;


   // Sets the size limit of the node map to `map_size`.
   void setNodeMapMaxSize(const size_t map_size);


   // Returns a hash describing the join columns of `join_expr`.
   // NOTE(review): `target_side` presumably selects which side of the join is hashed
   // and `extract_only_col_id` presumably restricts the hash to column IDs -- confirm
   // against QueryPlanDagCache.cpp.
   size_t getJoinColumnsInfoHash(const Analyzer::Expr* join_expr,

                                 JoinColumnSide target_side,

                                 bool extract_only_col_id);


   // Returns a hash computed from the given column variables.
   // NOTE(review): `col_vars` is taken by non-const reference; whether it is modified,
   // and the exact effect of `col_id_only`, should be checked in the .cpp.
   size_t translateColVarsToInfoHash(std::vector<const Analyzer::ColumnVar*>& col_vars,

                                     bool col_id_only) const;


   // NOTE(review): presumably empties node_map_ and cached_query_plan_dag_ -- confirm.
   void clearQueryPlanCache();


   // NOTE(review): presumably prints the cached DAG for debugging -- confirm.
   void printDag();


  private:

   // NOTE(review): by its name, expected to be called with cache_lock_ already held --
   // confirm.
   size_t getCurrentNodeMapSizeUnlocked() const;

   size_t getCurrentNodeMapCardinality() const;


   // a map between a rel node (its explained hash) and its unique node id

   RelNodeMap node_map_;

   // a graph structure that represents relationships among extracted query plan DAGs

   QueryPlanDag cached_query_plan_dag_;

   // a limit on the maximum size of the DAG cache (to prevent unbounded memory usage

   // for DAG maintenance)

   size_t max_node_map_size_;


   // a lock that guards access to the internal data structures of the DAG cache

   mutable std::mutex cache_lock_;

   // visitor instance for extracting column variables from expressions
   // NOTE(review): presumably used by collectColVars -- confirm.
   ColumnVarsVisitor col_var_visitor_;

 };

QueryPlanDagCache
Definition: QueryPlanDagCache.h:110

RelRexDagVisitor::visit
virtual void visit(RelAlgNode const *)
Definition: RelRexDagVisitor.cpp:68

QueryPlanDagCache::addNodeIfAbsent
std::optional< RelNodeId > addNodeIfAbsent(const RelAlgNode *)
Definition: QueryPlanDagCache.cpp:25

ScanNodeTableKeyCollector
Definition: QueryPlanDagCache.h:75

QueryPlanDagCache::connectNodes
void connectNodes(const RelNodeId parent_id, const RelNodeId child_id)
Definition: QueryPlanDagCache.cpp:42

Analyzer::Expr
Definition: Analyzer.h:68

QueryPlanDagCache::clearQueryPlanCache
void clearQueryPlanCache()
Definition: QueryPlanDagCache.cpp:146

JoinColumnSide
JoinColumnSide
Definition: RelAlgExecutionUnit.h:97

RelRexDagVisitor.h

ScalarExprVisitor::visit
T visit(const Analyzer::Expr *expr) const
Definition: ScalarExprVisitor.h:25

QueryPlanDagCache::getCurrentNodeMapSizeUnlocked
size_t getCurrentNodeMapSizeUnlocked() const
Definition: QueryPlanDagCache.cpp:138

QueryPlanDagCache::getJoinColumnsInfoHash
size_t getJoinColumnsInfoHash(const Analyzer::Expr *join_expr, JoinColumnSide target_side, bool extract_only_col_id)
Definition: QueryPlanDagCache.cpp:78

ScanNodeTableKeyCollector::getScanNodeTableKey
static std::unordered_set< size_t > getScanNodeTableKey(RelAlgNode const *rel_alg_node)
Definition: QueryPlanDagCache.h:79

QueryPlanDagCache::col_var_visitor_
ColumnVarsVisitor col_var_visitor_
Definition: QueryPlanDagCache.h:155

Analyzer::ColumnVar
Definition: Analyzer.h:194

ScanNodeTableKeyCollector::visit
void visit(RelScan const *scan_node) override
Definition: QueryPlanDagCache.h:86

QueryPlanDagCache::cached_query_plan_dag_
QueryPlanDag cached_query_plan_dag_
Definition: QueryPlanDagCache.h:148

RelRexDagVisitor
Definition: RelRexDagVisitor.h:37

QueryPlanDagCache::max_node_map_size_
size_t max_node_map_size_
Definition: QueryPlanDagCache.h:151

Analyzer::ExpressionTuple::getTuple
const std::vector< std::shared_ptr< Analyzer::Expr > > & getTuple() const
Definition: Analyzer.h:253

QueryPlanDagCache::translateColVarsToInfoHash
size_t translateColVarsToInfoHash(std::vector< const Analyzer::ColumnVar * > &col_vars, bool col_id_only) const
Definition: QueryPlanDagCache.cpp:55

ColumnVarsVisitor::visitColumnVarTuple
std::vector< const Analyzer::ColumnVar * > visitColumnVarTuple(const Analyzer::ExpressionTuple *expr_tuple) const override
Definition: QueryPlanDagCache.h:51

ScalarExprVisitor.h

QueryPlanDagCache::printDag
void printDag()
Definition: QueryPlanDagCache.cpp:122

QueryPlanDagCache::cache_lock_
std::mutex cache_lock_
Definition: QueryPlanDagCache.h:154

Analyzer::ExpressionTuple
Definition: Analyzer.h:248

QueryPlanDagCache::getCurrentNodeMapCardinality
size_t getCurrentNodeMapCardinality() const
Definition: QueryPlanDagCache.cpp:142

RelAlgDag.h

TableDescriptor::fragmenter
std::shared_ptr< Fragmenter_Namespace::AbstractFragmenter > fragmenter
Definition: TableDescriptor.h:63

RelNodeMap
std::unordered_map< RelNodeExplainedHash, RelNodeId > RelNodeMap
Definition: QueryPlanDagCache.h:36

ColumnVarsVisitor::aggregateResult
std::vector< const Analyzer::ColumnVar * > aggregateResult(const std::vector< const Analyzer::ColumnVar * > &aggregate, const std::vector< const Analyzer::ColumnVar * > &next_result) const override
Definition: QueryPlanDagCache.h:64

QueryPlanDagCache::QueryPlanDagCache
QueryPlanDagCache(size_t max_node_cache_size=MAX_NODE_CACHE_SIZE)
Definition: QueryPlanDagCache.h:112

RelAlgNode
Definition: RelAlgDag.h:828

QueryPlanDag
boost::labeled_graph< AdjacentList, RelNodeId, boost::hash_mapS > QueryPlanDag
Definition: QueryPlanDagCache.h:41

MAX_NODE_CACHE_SIZE
constexpr size_t MAX_NODE_CACHE_SIZE
Definition: QueryPlanDagCache.h:33

QueryPlanDagCache::setNodeMapMaxSize
void setNodeMapMaxSize(const size_t map_size)
Definition: QueryPlanDagCache.cpp:50

QueryPlanDagCache::node_map_
RelNodeMap node_map_
Definition: QueryPlanDagCache.h:146

ColumnVarsVisitor::visitColumnVar
std::vector< const Analyzer::ColumnVar * > visitColumnVar(const Analyzer::ColumnVar *column) const override
Definition: QueryPlanDagCache.h:46

hash_value
std::size_t hash_value(RexAbstractInput const &rex_ab_input)
Definition: RelAlgDag.cpp:3548

ScalarExprVisitor
Definition: ScalarExprVisitor.h:23

CHECK
#define CHECK(condition)
Definition: Logger.h:291

QueryPlanDagCache::getCurrentNodeMapSize
size_t getCurrentNodeMapSize() const
Definition: QueryPlanDagCache.cpp:133

ColumnVarsVisitor
Definition: QueryPlanDagCache.h:43

ScanNodeTableKeyCollector::table_keys_
std::unordered_set< size_t > table_keys_
Definition: QueryPlanDagCache.h:96

RelAlgExecutionUnit.h
Execution unit for relational algebra. It&#39;s a low-level description of any relational algebra operati...

RelScan
Definition: RelAlgDag.h:1093

RelScan::getTableDescriptor
const TableDescriptor * getTableDescriptor() const
Definition: RelAlgDag.h:1117

QueryPlanDagCache::collectColVars
std::vector< const Analyzer::ColumnVar * > collectColVars(const Analyzer::Expr *target)
Definition: QueryPlanDagCache.cpp:152

RelNodeId
size_t RelNodeId
Definition: RelAlgExecutionUnit.h:50

run_benchmark_import.result
dictionary result
Definition: run_benchmark_import.py:441

QueryPlanDagCache::operator=
QueryPlanDagCache & operator=(QueryPlanDagCache &&other)=delete