OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DataRecycler.h
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "Analyzer/Analyzer.h"
25 #include "QueryEngine/ResultSet.h"
27 #include "Shared/misc.h"
28 
29 #include <boost/functional/hash.hpp>
30 
31 #include <algorithm>
32 #include <ostream>
33 #include <unordered_map>
34 
// Placeholder metadata type: used as META_INFO_TYPE for cached items that
// carry no extra per-item meta information.
struct EMPTY_META_INFO {};
36 
// Item type that we try to recycle
enum CacheItemType {
  PERFECT_HT = 0,             // Perfect hashtable
  BASELINE_HT,                // Baseline hashtable
  OVERLAPS_HT,                // Overlaps hashtable
  HT_HASHING_SCHEME,          // Hashtable layout
  BASELINE_HT_APPROX_CARD,    // Approximated cardinality for baseline hashtable
  OVERLAPS_AUTO_TUNER_PARAM,  // Hashtable auto tuner's params for overlaps join
  QUERY_RESULTSET,            // query resultset
  CHUNK_METADATA,             // query resultset's chunk metadata
  // TODO (yoonmin): support the following items for recycling
  // COUNTALL_CARD_EST, Cardinality of query result
  // NDV_CARD_EST, # Non-distinct value
  // FILTER_SEL Selectivity of (push-downed) filter node
  NUM_CACHE_ITEM_TYPE  // sentinel: number of supported item types (keep last)
};

// Pretty-printer for logging; the string array must stay in sync with the
// enumerators of `CacheItemType` (enforced by the static_assert below).
inline std::ostream& operator<<(std::ostream& os, CacheItemType const item_type) {
  constexpr char const* cache_item_type_str[]{
      "Perfect Join Hashtable",
      "Baseline Join Hashtable",
      "Overlaps Join Hashtable",
      "Hashing Scheme for Join Hashtable",
      "Baseline Join Hashtable's Approximated Cardinality",
      "Overlaps Join Hashtable's Auto Tuner's Parameters",
      "Query ResultSet",
      "Chunk Metadata"};
  static_assert(sizeof(cache_item_type_str) / sizeof(*cache_item_type_str) ==
                CacheItemType::NUM_CACHE_ITEM_TYPE);
  return os << cache_item_type_str[item_type];
}
68 
// given item to be cached, it represents whether the item can be cached when considering
// various size limitation
enum CacheAvailability {
  AVAILABLE,                // item can be cached as is
  AVAILABLE_AFTER_CLEANUP,  // item can be cached after removing already cached items
  UNAVAILABLE               // item cannot be cached due to size limitation
};
76 
// whether a cache-size bookkeeping update adds or removes bytes
enum CacheUpdateAction { ADD, REMOVE };

// the order of enum values affects how we remove cached items when
// new item wants to be cached but there is not enough space to keep them
// regarding `REF_COUNT`, it represents how many times a cached item is referenced during
// its lifetime to numerically estimate the usefulness of this cached item
// (not to measure exact # reference count at time T as std::shared_ptr does)
enum CacheMetricType { REF_COUNT = 0, MEM_SIZE, COMPUTE_TIME, NUM_METRIC_TYPE };
86 // per query plan DAG metric
88  public:
89  CacheItemMetric(QueryPlanHash query_plan_hash, size_t compute_time, size_t mem_size)
90  : query_plan_hash_(query_plan_hash), metrics_({0, mem_size, compute_time}) {}
91 
92  QueryPlanHash getQueryPlanHash() const { return query_plan_hash_; }
93 
94  void incRefCount() { ++metrics_[CacheMetricType::REF_COUNT]; }
95 
96  size_t getRefCount() const { return metrics_[CacheMetricType::REF_COUNT]; }
97 
98  size_t getComputeTime() const { return metrics_[CacheMetricType::COMPUTE_TIME]; }
99 
100  size_t getMemSize() const { return metrics_[CacheMetricType::MEM_SIZE]; }
101 
102  const std::array<size_t, CacheMetricType::NUM_METRIC_TYPE>& getMetrics() const {
103  return metrics_;
104  }
105 
106  void setComputeTime(size_t compute_time) {
107  metrics_[CacheMetricType::COMPUTE_TIME] = compute_time;
108  }
109 
110  void setMemSize(const size_t mem_size) {
112  }
113 
114  std::string toString() const {
115  std::ostringstream oss;
116  oss << "Query plan hash: " << query_plan_hash_
117  << ", compute_time: " << metrics_[CacheMetricType::COMPUTE_TIME]
118  << ", mem_size: " << metrics_[CacheMetricType::MEM_SIZE]
119  << ", ref_count: " << metrics_[CacheMetricType::REF_COUNT];
120  return oss.str();
121  }
122 
123  private:
124  const QueryPlanHash query_plan_hash_;
125  std::array<size_t, CacheMetricType::NUM_METRIC_TYPE> metrics_;
126 };
127 
// forward declaration so the aliases below are self-contained
class CacheItemMetric;

// 0 = CPU, 1 ~ N : GPU-1 ~ GPU-N
using DeviceIdentifier = size_t;
using CacheSizeMap = std::unordered_map<DeviceIdentifier, size_t>;
using CacheMetricInfoMap =
    std::unordered_map<DeviceIdentifier, std::vector<std::shared_ptr<CacheItemMetric>>>;

class DataRecyclerUtil {
 public:
  // device id 0 is reserved for the CPU cache (see comment above)
  static constexpr DeviceIdentifier CPU_DEVICE_IDENTIFIER = 0;

  // "CPU" for the CPU device, "GPU-<id>" otherwise
  static std::string getDeviceIdentifierString(DeviceIdentifier device_identifier) {
    std::string device_type = device_identifier == CPU_DEVICE_IDENTIFIER ? "CPU" : "GPU-";
    return device_identifier != CPU_DEVICE_IDENTIFIER
               ? device_type.append(std::to_string(device_identifier))
               : device_type;
  }
};
145 
146 // contain information regarding 1) per-cache item metric: perfect ht-1, perfect ht-2,
147 // baseline ht-1, ... and 2) per-type size in current: perfect-ht cache size, baseline-ht
148 // cache size, overlaps-ht cache size, ...
150  public:
152  size_t total_cache_size,
153  size_t max_cache_item_size,
154  int num_gpus = 0)
155  : item_type_(cache_item_type)
156  , total_cache_size_(total_cache_size)
157  , max_cache_item_size_(max_cache_item_size) {
158  // initialize cache metrics for each device: CPU, GPU0, GPU1, ...
159  // Currently we only consider maintaining our cache in CPU-memory
160  for (int gpu_device_identifier = num_gpus; gpu_device_identifier >= 1;
161  --gpu_device_identifier) {
162  cache_metrics_.emplace(gpu_device_identifier,
163  std::vector<std::shared_ptr<CacheItemMetric>>());
164  current_cache_size_in_bytes_.emplace(gpu_device_identifier, 0);
165  }
167  std::vector<std::shared_ptr<CacheItemMetric>>());
169 
170  if (total_cache_size_ < 1024 * 1024 * 256) {
171  LOG(INFO) << "The total cache size of " << cache_item_type
172  << " is set too low, so we suggest raising it larger than 256MB";
173  }
174 
175  if (max_cache_item_size < 1024 * 1024 * 10) {
176  LOG(INFO)
177  << "The maximum item size of " << cache_item_type
178  << " that can be cached is set too low, we suggest raising it larger than 10MB";
179  }
180  if (max_cache_item_size > total_cache_size_) {
181  LOG(INFO) << "The maximum item size of " << cache_item_type
182  << " is set larger than its total cache size, so we force to set the "
183  "maximum item size as equal to the total cache size";
184  max_cache_item_size = total_cache_size_;
185  }
186  }
187 
188  static inline CacheMetricInfoMap::mapped_type::const_iterator getCacheItemMetricItr(
189  QueryPlanHash key,
190  CacheMetricInfoMap::mapped_type const& metrics) {
191  auto same_hash = [key](auto itr) { return itr->getQueryPlanHash() == key; };
192  return std::find_if(metrics.cbegin(), metrics.cend(), same_hash);
193  }
194 
195  static inline std::shared_ptr<CacheItemMetric> getCacheItemMetricImpl(
196  QueryPlanHash key,
197  CacheMetricInfoMap::mapped_type const& metrics) {
198  auto itr = getCacheItemMetricItr(key, metrics);
199  return itr == metrics.cend() ? nullptr : *itr;
200  }
201 
202  std::vector<std::shared_ptr<CacheItemMetric>>& getCacheItemMetrics(
203  DeviceIdentifier device_identifier) {
204  auto itr = cache_metrics_.find(device_identifier);
205  CHECK(itr != cache_metrics_.end());
206  return itr->second;
207  }
208 
209  std::shared_ptr<CacheItemMetric> getCacheItemMetric(
210  QueryPlanHash key,
211  DeviceIdentifier device_identifier) const {
212  auto itr = cache_metrics_.find(device_identifier);
213  return itr == cache_metrics_.cend() ? nullptr
214  : getCacheItemMetricImpl(key, itr->second);
215  }
216 
217  void setCurrentCacheSize(DeviceIdentifier device_identifier, size_t bytes) {
218  if (bytes > total_cache_size_) {
219  return;
220  }
221  auto itr = current_cache_size_in_bytes_.find(device_identifier);
222  CHECK(itr != current_cache_size_in_bytes_.end());
223  itr->second = bytes;
224  }
225 
226  std::optional<size_t> getCurrentCacheSize(DeviceIdentifier key) const {
227  auto same_hash = [key](auto itr) { return itr.first == key; };
228  auto itr = std::find_if(current_cache_size_in_bytes_.cbegin(),
230  same_hash);
231  return itr == current_cache_size_in_bytes_.cend() ? std::nullopt
232  : std::make_optional(itr->second);
233  }
234 
235  std::shared_ptr<CacheItemMetric> putNewCacheItemMetric(
236  QueryPlanHash key,
237  DeviceIdentifier device_identifier,
238  size_t mem_size,
239  size_t compute_time) {
240  auto itr = cache_metrics_.find(device_identifier);
241  CHECK(itr != cache_metrics_.end());
242  if (auto cached_metric = getCacheItemMetricImpl(key, itr->second)) {
243  if (cached_metric->getMemSize() != mem_size) {
245  device_identifier, CacheUpdateAction::REMOVE, cached_metric->getMemSize());
246  removeCacheItemMetric(key, device_identifier);
247  } else {
248  cached_metric->incRefCount();
249  return cached_metric;
250  }
251  }
252  auto cache_metric = std::make_shared<CacheItemMetric>(key, compute_time, mem_size);
253  updateCurrentCacheSize(device_identifier, CacheUpdateAction::ADD, mem_size);
254  // we add the item to cache after we create it during query runtime
255  // so it is used at least once
256  cache_metric->incRefCount();
257  return itr->second.emplace_back(std::move(cache_metric));
258  }
259 
260  void removeCacheItemMetric(QueryPlanHash key, DeviceIdentifier device_identifier) {
261  auto& cache_metrics = getCacheItemMetrics(device_identifier);
262  auto itr = getCacheItemMetricItr(key, cache_metrics);
263  if (itr != cache_metrics.cend()) {
264  cache_metrics.erase(itr);
265  }
266  }
267 
268  void removeMetricFromBeginning(DeviceIdentifier device_identifier, int offset) {
269  auto metrics = getCacheItemMetrics(device_identifier);
270  metrics.erase(metrics.begin(), metrics.begin() + offset);
271  }
272 
274  size_t item_size) const {
275  auto it = current_cache_size_in_bytes_.find(device_identifier);
276  CHECK(it != current_cache_size_in_bytes_.end());
277  CHECK_LE(item_size, total_cache_size_);
278  const auto current_cache_size = it->second;
279  long rem = total_cache_size_ - current_cache_size;
280  return rem < 0 ? item_size : item_size - rem;
281  }
282 
284  for (auto& kv : current_cache_size_in_bytes_) {
285  auto cache_item_metrics = getCacheItemMetrics(kv.first);
286  VLOG(1) << "Clear cache of " << item_type_ << " from device [" << kv.first
287  << "] (# cached items: " << cache_item_metrics.size() << ", " << kv.second
288  << " bytes)";
290  CHECK_EQ(getCurrentCacheSize(kv.first).value(), 0u);
291  }
292  for (auto& kv : cache_metrics_) {
293  kv.second.clear();
294  }
295  }
296 
298  size_t item_size) const {
299  if (item_size > max_cache_item_size_ || item_size > total_cache_size_) {
301  }
302  // now we know that a cache can hold the new item since its size is less than
303  // per-item maximum size limit
304  // check if we need to remove some (or all) of cached item to make a room
305  // for the new item
306  auto current_cache_size = getCurrentCacheSize(device_identifier);
307  CHECK(current_cache_size.has_value());
308  auto cache_size_after_addition = *current_cache_size + item_size;
309  if (cache_size_after_addition > total_cache_size_) {
310  // if so, we need to remove the item to hold the new one within the cache
312  }
313  // cache has a sufficient space to hold the new item
314  // thus, there is no need to remove cached item
316  }
317 
318  void updateCurrentCacheSize(DeviceIdentifier device_identifier,
320  size_t size) {
321  auto current_cache_size = getCurrentCacheSize(device_identifier);
322  CHECK(current_cache_size.has_value());
323  if (action == CacheUpdateAction::ADD) {
324  setCurrentCacheSize(device_identifier, current_cache_size.value() + size);
325  } else {
327  CHECK_LE(size, *current_cache_size);
328  setCurrentCacheSize(device_identifier, current_cache_size.value() - size);
329  }
330  }
331 
333  auto& metric_cache = getCacheItemMetrics(device_identifier);
334  std::sort(metric_cache.begin(),
335  metric_cache.end(),
336  [](const std::shared_ptr<CacheItemMetric>& left,
337  const std::shared_ptr<CacheItemMetric>& right) {
338  auto& elem1_metrics = left->getMetrics();
339  auto& elem2_metrics = right->getMetrics();
340  for (size_t i = 0; i < CacheMetricType::NUM_METRIC_TYPE; ++i) {
341  if (elem1_metrics[i] != elem2_metrics[i]) {
342  return elem1_metrics[i] < elem2_metrics[i];
343  }
344  }
345  return false;
346  });
347  }
348 
349  std::string toString() const {
350  std::ostringstream oss;
351  oss << "Current memory consumption of caches for each device:\n";
352  for (auto& kv : current_cache_size_in_bytes_) {
353  oss << "\t\tDevice " << kv.first << " : " << kv.second << " bytes\n";
354  }
355  return oss.str();
356  }
357 
358  size_t getTotalCacheSize() const { return total_cache_size_; }
359  size_t getMaxCacheItemSize() const { return max_cache_item_size_; }
360  void setTotalCacheSize(size_t new_total_cache_size) {
361  if (new_total_cache_size > 0) {
362  total_cache_size_ = new_total_cache_size;
363  }
364  }
365  void setMaxCacheItemSize(size_t new_max_cache_item_size) {
366  if (new_max_cache_item_size > 0) {
367  max_cache_item_size_ = new_max_cache_item_size;
368  }
369  }
370 
371  private:
375  // metadata of cached item that belongs to a cache of a specific device
376  // 1) ref_count: how many times this cached item is recycled
377  // 2) memory_usage: the size of cached item in bytes
378  // 3) compute_time: an elapsed time to generate this cached item
380 
381  // the total amount of currently cached data per device
383 };
384 
385 template <typename CACHED_ITEM_TYPE, typename META_INFO_TYPE>
386 struct CachedItem {
388  CACHED_ITEM_TYPE item,
389  std::shared_ptr<CacheItemMetric> item_metric_ptr,
390  std::optional<META_INFO_TYPE> metadata = std::nullopt)
391  : key(hashed_plan)
392  , cached_item(item)
393  , item_metric(item_metric_ptr)
394  , meta_info(metadata)
395  , dirty(false) {}
396 
397  void setDirty() { dirty = true; }
398  bool isDirty() const { return dirty; }
399 
401  CACHED_ITEM_TYPE cached_item;
402  std::shared_ptr<CacheItemMetric> item_metric;
403  std::optional<META_INFO_TYPE> meta_info;
404  bool dirty;
405 };
406 
407 // A main class of data recycler
408 // note that some tests which directly accesses APIs for update/modify/delete
409 // (meta)data may need to disable data recycler explicitly before running test suites
410 // to make test scenarios as expected
411 // i.e., UpdelStorageTest that calls fragmenter's updateColumn API
412 template <typename CACHED_ITEM_TYPE, typename META_INFO_TYPE>
414  public:
415  using CachedItemContainer = std::vector<CachedItem<CACHED_ITEM_TYPE, META_INFO_TYPE>>;
417  std::unordered_map<DeviceIdentifier, std::shared_ptr<CachedItemContainer>>;
419  std::unordered_map<CacheItemType, std::shared_ptr<PerDeviceCacheItemContainer>>;
420  using PerTypeCacheMetricTracker = std::unordered_map<CacheItemType, CacheMetricTracker>;
421 
422  DataRecycler(const std::vector<CacheItemType>& item_types,
423  size_t total_cache_size,
424  size_t max_item_size,
425  int num_gpus) {
426  for (auto& item_type : item_types) {
427  cache_item_types_.insert(item_type);
428  metric_tracker_.emplace(
429  item_type,
430  CacheMetricTracker(item_type, total_cache_size, max_item_size, num_gpus));
431  auto item_container = std::make_shared<PerDeviceCacheItemContainer>();
432  for (int gpu_device_identifier = num_gpus; gpu_device_identifier >= 1;
433  --gpu_device_identifier) {
434  item_container->emplace(gpu_device_identifier,
435  std::make_shared<CachedItemContainer>());
436  }
437  item_container->emplace(DataRecyclerUtil::CPU_DEVICE_IDENTIFIER,
438  std::make_shared<CachedItemContainer>());
439  cached_items_container_.emplace(item_type, item_container);
440  }
441  }
442 
443  virtual ~DataRecycler() = default;
444 
445  virtual CACHED_ITEM_TYPE getItemFromCache(
446  QueryPlanHash key,
447  CacheItemType item_type,
448  DeviceIdentifier device_identifier,
449  std::optional<META_INFO_TYPE> meta_info = std::nullopt) = 0;
450 
451  virtual void putItemToCache(QueryPlanHash key,
452  CACHED_ITEM_TYPE item_ptr,
453  CacheItemType item_type,
454  DeviceIdentifier device_identifier,
455  size_t item_size,
456  size_t compute_time,
457  std::optional<META_INFO_TYPE> meta_info = std::nullopt) = 0;
458 
459  virtual void initCache() = 0;
460 
461  virtual void clearCache() = 0;
462 
463  virtual void markCachedItemAsDirty(size_t table_key,
464  std::unordered_set<QueryPlanHash>& key_set,
465  CacheItemType item_type,
466  DeviceIdentifier device_identifier) = 0;
467 
469  auto candidate_it = std::find_if(
470  m.begin(),
471  m.end(),
472  [&key](const CachedItem<CACHED_ITEM_TYPE, META_INFO_TYPE>& cached_item) {
473  return cached_item.key == key;
474  });
475  if (candidate_it != m.end()) {
476  candidate_it->setDirty();
477  }
478  }
479 
481  auto candidate_it = std::find_if(
482  m.begin(),
483  m.end(),
484  [&key](const CachedItem<CACHED_ITEM_TYPE, META_INFO_TYPE>& cached_item) {
485  return cached_item.key == key;
486  });
487  return candidate_it != m.end() && candidate_it->isDirty();
488  }
489 
490  virtual std::string toString() const = 0;
491 
492  std::shared_ptr<CachedItemContainer> getCachedItemContainer(
493  CacheItemType item_type,
494  DeviceIdentifier device_identifier) const {
495  auto item_type_container_itr = cached_items_container_.find(item_type);
496  if (item_type_container_itr != cached_items_container_.end()) {
497  auto device_type_container_itr =
498  item_type_container_itr->second->find(device_identifier);
499  return device_type_container_itr != item_type_container_itr->second->end()
500  ? device_type_container_itr->second
501  : nullptr;
502  }
503  return nullptr;
504  }
505 
506  std::optional<CachedItem<CACHED_ITEM_TYPE, META_INFO_TYPE>>
508  CacheItemType item_type,
509  DeviceIdentifier device_identifier,
511  std::lock_guard<std::mutex>& lock) {
512  auto candidate_it = std::find_if(
513  m.begin(),
514  m.end(),
515  [&key](const CachedItem<CACHED_ITEM_TYPE, META_INFO_TYPE>& cached_item) {
516  return cached_item.key == key;
517  });
518  if (candidate_it != m.end()) {
519  if (candidate_it->isDirty()) {
521  key, item_type, device_identifier, lock, candidate_it->meta_info);
522  return std::nullopt;
523  }
524  return *candidate_it;
525  }
526  return std::nullopt;
527  }
528 
530  DeviceIdentifier device_identifier) const {
531  std::lock_guard<std::mutex> lock(cache_lock_);
532  auto container = getCachedItemContainer(item_type, device_identifier);
533  return container ? container->size() : 0;
534  }
535 
537  DeviceIdentifier device_identifier) const {
538  std::lock_guard<std::mutex> lock(cache_lock_);
539  auto container = getCachedItemContainer(item_type, device_identifier);
540  return std::count_if(container->begin(),
541  container->end(),
542  [](const auto& cached_item) { return cached_item.isDirty(); });
543  }
544 
546  DeviceIdentifier device_identifier) const {
547  std::lock_guard<std::mutex> lock(cache_lock_);
548  auto container = getCachedItemContainer(item_type, device_identifier);
549  return std::count_if(container->begin(),
550  container->end(),
551  [](const auto& cached_item) { return !cached_item.isDirty(); });
552  }
553 
555  DeviceIdentifier device_identifier) const {
556  std::lock_guard<std::mutex> lock(cache_lock_);
557  auto metric_tracker = getMetricTracker(item_type);
558  auto current_size_opt = metric_tracker.getCurrentCacheSize(device_identifier);
559  return current_size_opt ? current_size_opt.value() : 0;
560  }
561 
562  std::shared_ptr<CacheItemMetric> getCachedItemMetric(CacheItemType item_type,
563  DeviceIdentifier device_identifier,
564  QueryPlanHash key) const {
565  std::lock_guard<std::mutex> lock(cache_lock_);
566  auto cache_metric_tracker = getMetricTracker(item_type);
567  return cache_metric_tracker.getCacheItemMetric(key, device_identifier);
568  }
569 
570  void setTotalCacheSize(CacheItemType item_type, size_t new_total_cache_size) {
571  if (new_total_cache_size > 0) {
572  std::lock_guard<std::mutex> lock(cache_lock_);
573  getMetricTracker(item_type).setTotalCacheSize(new_total_cache_size);
574  }
575  }
576 
577  void setMaxCacheItemSize(CacheItemType item_type, size_t new_max_cache_item_size) {
578  if (new_max_cache_item_size > 0) {
579  std::lock_guard<std::mutex> lock(cache_lock_);
580  getMetricTracker(item_type).setMaxCacheItemSize(new_max_cache_item_size);
581  }
582  }
583 
584  protected:
586  DeviceIdentifier device_identifier,
587  int offset) {
588  // it removes cached items located from `idx 0` to `offset`
589  // so, call this function after sorting the cached items container vec
590  // and we should call this function under the proper locking scheme
591  auto container = getCachedItemContainer(item_type, device_identifier);
592  CHECK(container);
593  container->erase(container->begin(), container->begin() + offset);
594  }
595 
597  DeviceIdentifier device_identifier) {
598  // should call this function under the proper locking scheme
599  auto container = getCachedItemContainer(item_type, device_identifier);
600  CHECK(container);
601  std::sort(container->begin(),
602  container->end(),
605  auto& left_metrics = left.item_metric->getMetrics();
606  auto& right_metrics = right.item_metric->getMetrics();
607  for (size_t i = 0; i < CacheMetricType::NUM_METRIC_TYPE; ++i) {
608  if (left_metrics[i] != right_metrics[i]) {
609  return left_metrics[i] < right_metrics[i];
610  }
611  }
612  return false;
613  });
614  }
615 
616  std::mutex& getCacheLock() const { return cache_lock_; }
617 
619  auto metric_iter = metric_tracker_.find(item_type);
620  CHECK(metric_iter != metric_tracker_.end());
621  return metric_iter->second;
622  }
623 
625  return const_cast<DataRecycler*>(this)->getMetricTracker(item_type);
626  }
627 
628  std::unordered_set<CacheItemType> const& getCacheItemType() const {
629  return cache_item_types_;
630  }
631 
634  }
635 
636  private:
637  // internally called under the proper locking scheme
638  virtual bool hasItemInCache(
639  QueryPlanHash key,
640  CacheItemType item_type,
641  DeviceIdentifier device_identifier,
642  std::lock_guard<std::mutex>& lock,
643  std::optional<META_INFO_TYPE> meta_info = std::nullopt) const = 0;
644 
645  // internally called under the proper locking scheme
646  virtual void removeItemFromCache(
647  QueryPlanHash key,
648  CacheItemType item_type,
649  DeviceIdentifier device_identifier,
650  std::lock_guard<std::mutex>& lock,
651  std::optional<META_INFO_TYPE> meta_info = std::nullopt) = 0;
652 
653  // internally called under the proper locking scheme
654  virtual void cleanupCacheForInsertion(
655  CacheItemType item_type,
656  DeviceIdentifier device_identifier,
657  size_t required_size,
658  std::lock_guard<std::mutex>& lock,
659  std::optional<META_INFO_TYPE> meta_info = std::nullopt) = 0;
660 
661  // a set of cache item type that this recycler supports
662  std::unordered_set<CacheItemType> cache_item_types_;
663 
664  // cache metric tracker
666 
667  // per-device cached item containers for each cached item type
669 
670  mutable std::mutex cache_lock_;
671 };
Defines data structures for the semantic analysis phase of query processing.
CACHED_ITEM_TYPE cached_item
Definition: DataRecycler.h:401
std::mutex & getCacheLock() const
Definition: DataRecycler.h:616
std::unordered_map< CacheItemType, std::shared_ptr< PerDeviceCacheItemContainer >> PerTypeCacheItemContainer
Definition: DataRecycler.h:419
#define CHECK_EQ(x, y)
Definition: Logger.h:231
CacheUpdateAction
Definition: DataRecycler.h:77
size_t DeviceIdentifier
Definition: DataRecycler.h:129
static std::string getDeviceIdentifierString(DeviceIdentifier device_identifier)
Definition: DataRecycler.h:138
virtual std::string toString() const =0
size_t calculateRequiredSpaceForItemAddition(DeviceIdentifier device_identifier, size_t item_size) const
Definition: DataRecycler.h:273
std::shared_ptr< CacheItemMetric > putNewCacheItemMetric(QueryPlanHash key, DeviceIdentifier device_identifier, size_t mem_size, size_t compute_time)
Definition: DataRecycler.h:235
std::optional< CachedItem< CACHED_ITEM_TYPE, META_INFO_TYPE > > getCachedItemWithoutConsideringMetaInfo(QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier, CachedItemContainer &m, std::lock_guard< std::mutex > &lock)
Definition: DataRecycler.h:507
size_t getCurrentNumCachedItems(CacheItemType item_type, DeviceIdentifier device_identifier) const
Definition: DataRecycler.h:529
CacheMetricTracker & getMetricTracker(CacheItemType item_type)
Definition: DataRecycler.h:618
CacheItemType item_type_
Definition: DataRecycler.h:372
std::unordered_map< DeviceIdentifier, size_t > CacheSizeMap
Definition: DataRecycler.h:130
DataRecycler(const std::vector< CacheItemType > &item_types, size_t total_cache_size, size_t max_item_size, int num_gpus)
Definition: DataRecycler.h:422
bool isDirty() const
Definition: DataRecycler.h:398
#define LOG(tag)
Definition: Logger.h:217
std::vector< CachedItem< std::optional< HashType >, EMPTY_META_INFO >> CachedItemContainer
Definition: DataRecycler.h:415
std::ostream & operator<<(std::ostream &os, const SessionInfo &session_info)
Definition: SessionInfo.cpp:57
CacheMetricTracker const & getMetricTracker(CacheItemType item_type) const
Definition: DataRecycler.h:624
void setDirty()
Definition: DataRecycler.h:397
CacheItemMetric(QueryPlanHash query_plan_hash, size_t compute_time, size_t mem_size)
Definition: DataRecycler.h:89
std::unordered_set< CacheItemType > cache_item_types_
Definition: DataRecycler.h:662
DEVICE void sort(ARGS &&...args)
Definition: gpu_enabled.h:105
QueryPlanHash key
Definition: DataRecycler.h:400
std::optional< size_t > getCurrentCacheSize(DeviceIdentifier key) const
Definition: DataRecycler.h:226
size_t getCurrentNumCleanCachedItems(CacheItemType item_type, DeviceIdentifier device_identifier) const
Definition: DataRecycler.h:545
std::shared_ptr< CachedItemContainer > getCachedItemContainer(CacheItemType item_type, DeviceIdentifier device_identifier) const
Definition: DataRecycler.h:492
void setMaxCacheItemSize(size_t new_max_cache_item_size)
Definition: DataRecycler.h:365
void markCachedItemAsDirtyImpl(QueryPlanHash key, CachedItemContainer &m) const
Definition: DataRecycler.h:468
void setMaxCacheItemSize(CacheItemType item_type, size_t new_max_cache_item_size)
Definition: DataRecycler.h:577
CacheAvailability canAddItem(DeviceIdentifier device_identifier, size_t item_size) const
Definition: DataRecycler.h:297
void clearCacheMetricTracker()
Definition: DataRecycler.h:283
std::string to_string(char const *&&v)
PerTypeCacheItemContainer const & getItemCache() const
Definition: DataRecycler.h:632
CacheAvailability
Definition: DataRecycler.h:71
PerTypeCacheMetricTracker metric_tracker_
Definition: DataRecycler.h:665
size_t getCurrentCacheSizeForDevice(CacheItemType item_type, DeviceIdentifier device_identifier) const
Definition: DataRecycler.h:554
std::mutex cache_lock_
Definition: DataRecycler.h:670
void setCurrentCacheSize(DeviceIdentifier device_identifier, size_t bytes)
Definition: DataRecycler.h:217
size_t getMaxCacheItemSize() const
Definition: DataRecycler.h:359
std::vector< std::shared_ptr< CacheItemMetric > > & getCacheItemMetrics(DeviceIdentifier device_identifier)
Definition: DataRecycler.h:202
void removeCacheItemMetric(QueryPlanHash key, DeviceIdentifier device_identifier)
Definition: DataRecycler.h:260
CacheItemType
Definition: DataRecycler.h:38
std::optional< META_INFO_TYPE > meta_info
Definition: DataRecycler.h:403
virtual void initCache()=0
size_t getTotalCacheSize() const
Definition: DataRecycler.h:358
bool isCachedItemDirty(QueryPlanHash key, CachedItemContainer &m) const
Definition: DataRecycler.h:480
std::shared_ptr< CacheItemMetric > getCacheItemMetric(QueryPlanHash key, DeviceIdentifier device_identifier) const
Definition: DataRecycler.h:209
std::string toString() const
Definition: DataRecycler.h:349
PerTypeCacheItemContainer cached_items_container_
Definition: DataRecycler.h:668
size_t max_cache_item_size_
Definition: DataRecycler.h:374
CacheSizeMap current_cache_size_in_bytes_
Definition: DataRecycler.h:382
void updateCurrentCacheSize(DeviceIdentifier device_identifier, CacheUpdateAction action, size_t size)
Definition: DataRecycler.h:318
virtual void putItemToCache(QueryPlanHash key, CACHED_ITEM_TYPE item_ptr, CacheItemType item_type, DeviceIdentifier device_identifier, size_t item_size, size_t compute_time, std::optional< META_INFO_TYPE > meta_info=std::nullopt)=0
std::unordered_map< CacheItemType, CacheMetricTracker > PerTypeCacheMetricTracker
Definition: DataRecycler.h:420
std::shared_ptr< CacheItemMetric > getCachedItemMetric(CacheItemType item_type, DeviceIdentifier device_identifier, QueryPlanHash key) const
Definition: DataRecycler.h:562
std::string toString(const Executor::ExtModuleKinds &kind)
Definition: Execute.h:1453
void setTotalCacheSize(size_t new_total_cache_size)
Definition: DataRecycler.h:360
void setTotalCacheSize(CacheItemType item_type, size_t new_total_cache_size)
Definition: DataRecycler.h:570
static std::shared_ptr< CacheItemMetric > getCacheItemMetricImpl(QueryPlanHash key, CacheMetricInfoMap::mapped_type const &metrics)
Definition: DataRecycler.h:195
size_t getCurrentNumDirtyCachedItems(CacheItemType item_type, DeviceIdentifier device_identifier) const
Definition: DataRecycler.h:536
std::unordered_map< DeviceIdentifier, std::shared_ptr< CachedItemContainer >> PerDeviceCacheItemContainer
Definition: DataRecycler.h:417
CacheMetricType
Definition: DataRecycler.h:84
void sortCacheContainerByQueryMetric(CacheItemType item_type, DeviceIdentifier device_identifier)
Definition: DataRecycler.h:596
#define CHECK_LE(x, y)
Definition: Logger.h:234
void sortCacheInfoByQueryMetric(DeviceIdentifier device_identifier)
Definition: DataRecycler.h:332
std::unordered_set< CacheItemType > const & getCacheItemType() const
Definition: DataRecycler.h:628
CacheMetricTracker(CacheItemType cache_item_type, size_t total_cache_size, size_t max_cache_item_size, int num_gpus=0)
Definition: DataRecycler.h:151
virtual ~DataRecycler()=default
size_t QueryPlanHash
std::unordered_map< DeviceIdentifier, std::vector< std::shared_ptr< CacheItemMetric >>> CacheMetricInfoMap
Definition: DataRecycler.h:132
static CacheMetricInfoMap::mapped_type::const_iterator getCacheItemMetricItr(QueryPlanHash key, CacheMetricInfoMap::mapped_type const &metrics)
Definition: DataRecycler.h:188
CachedItem(QueryPlanHash hashed_plan, CACHED_ITEM_TYPE item, std::shared_ptr< CacheItemMetric > item_metric_ptr, std::optional< META_INFO_TYPE > metadata=std::nullopt)
Definition: DataRecycler.h:387
virtual bool hasItemInCache(QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier, std::lock_guard< std::mutex > &lock, std::optional< META_INFO_TYPE > meta_info=std::nullopt) const =0
bool g_enable_watchdog false
Definition: Execute.cpp:79
CacheMetricInfoMap cache_metrics_
Definition: DataRecycler.h:379
#define CHECK(condition)
Definition: Logger.h:223
virtual void clearCache()=0
virtual void removeItemFromCache(QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier, std::lock_guard< std::mutex > &lock, std::optional< META_INFO_TYPE > meta_info=std::nullopt)=0
Basic constructors and methods of the row set interface.
virtual CACHED_ITEM_TYPE getItemFromCache(QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier, std::optional< META_INFO_TYPE > meta_info=std::nullopt)=0
virtual void markCachedItemAsDirty(size_t table_key, std::unordered_set< QueryPlanHash > &key_set, CacheItemType item_type, DeviceIdentifier device_identifier)=0
void removeMetricFromBeginning(DeviceIdentifier device_identifier, int offset)
Definition: DataRecycler.h:268
Execution unit for relational algebra. It&#39;s a low-level description of any relational algebra operati...
static constexpr DeviceIdentifier CPU_DEVICE_IDENTIFIER
Definition: DataRecycler.h:136
void removeCachedItemFromBeginning(CacheItemType item_type, DeviceIdentifier device_identifier, int offset)
Definition: DataRecycler.h:585
virtual void cleanupCacheForInsertion(CacheItemType item_type, DeviceIdentifier device_identifier, size_t required_size, std::lock_guard< std::mutex > &lock, std::optional< META_INFO_TYPE > meta_info=std::nullopt)=0
std::shared_ptr< CacheItemMetric > item_metric
Definition: DataRecycler.h:402
#define VLOG(n)
Definition: Logger.h:317
std::array< size_t, CacheMetricType::NUM_METRIC_TYPE > metrics_
Definition: DataRecycler.h:90