OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CachingFileMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
30 #pragma once
31 
32 #include <optional>
33 #include <ostream>
34 
36 #include "FileMgr.h"
37 #include "Shared/File.h"
38 #include "Shared/SysDefinitions.h"
39 
40 namespace File_Namespace {
41 
// Level at which the disk cache is enabled; `none` means disabled (see
// DiskCacheConfig::isEnabled()).
// NOTE(review): presumably `fsi` covers foreign-storage tables and `non_fsi` the
// remaining (mutable) tables, cf. isEnabledForFSI()/isEnabledForMutableTables() --
// their bodies are not visible in this listing, confirm against the real header.
42 enum class DiskCacheLevel { none, fsi, non_fsi, all };
43
// Stream insertion for DiskCacheLevel; only declared here, the definition lives elsewhere.
44 std::ostream& operator<<(std::ostream& os, DiskCacheLevel disk_cache_level);
45 
// The UL literal suffix is avoided because 'long' has a different size on *nix
// and Windows; every factor is explicitly converted to size_t instead.
static constexpr size_t DEFAULT_MAX_SIZE{static_cast<size_t>(100) *
                                         static_cast<size_t>(1024) *
                                         static_cast<size_t>(1024) *
                                         static_cast<size_t>(1024)};  // 100G default
51  std::string path;
53  size_t num_reader_threads = 0;
57  inline bool isEnabledForMutableTables() const {
60  }
61  inline bool isEnabledForFSI() const {
63  }
64  inline bool isEnabled() const { return enabled_level != DiskCacheLevel::none; }
65  std::string dump() const {
66  std::stringstream ss;
67  ss << "DiskCacheConfig(path = " << path << ", level = " << levelAsString()
68  << ", threads = " << num_reader_threads << ", size limit = " << size_limit
69  << ", page size = " << page_size << ")";
70  return ss.str();
71  }
72  std::string levelAsString() const {
73  switch (enabled_level) {
75  return "none";
77  return "fsi";
79  return "non_fsi";
81  return "all";
82  }
83  return "";
84  }
85  static std::string getDefaultPath(const std::string& base_path) {
86  return base_path + "/" + shared::kDefaultDiskCacheDirName;
87  }
88 };
89 
/**
 * Builds the directory name used for a table.
 *
 * @param db_id id of the database.
 * @param tb_id id of the table.
 * @return "table_<db_id>_<tb_id>/" (note the trailing slash).
 *
 * The ids are formatted with std::to_string instead of a string stream so that the
 * result does not depend on the global C++ locale (a locale with digit grouping would
 * otherwise alter the directory name) and so that this header does not rely on
 * <sstream> being pulled in transitively.
 */
inline std::string get_dir_name_for_table(int db_id, int tb_id) {
  return "table_" + std::to_string(db_id) + "_" + std::to_string(tb_id) + "/";
}
95 
96 // Class to control access to the table-specific directories and data created inside a
97 // CachingFileMgr.
// NOTE(review): this listing skips some original line numbers (e.g. 101, 108-111, 147,
// 151), so a few members of the real class are not visible here.
98 class TableFileMgr {
99  public:
     // Constructs the manager for the table subdirectory at `table_path`.
100  TableFileMgr(const std::string& table_path);
102 
     // Increments the epoch for this subdir (not synced to disk).
106  void incrementEpoch();
107 
112 
     // Returns the current epoch (locked).
116  int32_t getEpoch() const;
117 
     // Removes all disk data for the subdir.
121  void removeDiskContent() const;
122 
     // Returns the disk space used (in bytes) for the subdir.
126  size_t getReservedSpace() const;
127 
     // Deletes only the wrapper file on disk.
131  void deleteWrapperFile() const;
132 
     // Writes the wrapper file to disk; `doc` is the content to write.
136  void writeWrapperFile(const std::string& doc) const;
137 
     // Presumably reports whether the wrapper file exists on disk -- confirm in the .cpp.
141  bool hasWrapperFile() const;
142 
143  private:
     // Paths for this table's subdirectory and, judging by the names, its epoch and
     // wrapper files -- how they are derived is not visible here.
144  std::string table_path_;
145  std::string epoch_file_path_;
146  std::string wrapper_file_path_;
     // Starts out true; where it is cleared/set is not visible in this header.
148  bool is_checkpointed_ = true;
     // C FILE handle for the epoch file; null until opened elsewhere.
149  FILE* epoch_file_ = nullptr;
150 
152 };
153 
154 // Extension of FileBuffer with restricted behaviour.
156  public:
158  // The cache can only be appended to, not written, as it lets us maintain a single
159  // version of the data. This override is to make sure we don't accidentally start
160  // writing to cache buffers.
     // All arguments are ignored: the body unconditionally reaches UNREACHABLE() with the
     // message below, so any call to write() on a cache buffer is treated as a programming
     // error.
161  void write(int8_t* src,
162  const size_t numBytes,
163  const size_t offset = 0,
164  const MemoryLevel srcMemoryLevel = CPU_LEVEL,
165  const int32_t deviceId = -1) override {
166  UNREACHABLE() << "Cache buffers support append(), but not write()";
167  }
168 };
169 
// A FileMgr capable of limiting its size and storing data from multiple tables in a
// shared directory.
// NOTE(review): this listing skips some original line numbers, so the first line of a
// few multi-line declarations and some inline bodies are not visible below.
178 class CachingFileMgr : public FileMgr {
179  public:
     // Name of the wrapper file (see writeWrapperFile()/deleteWrapperFile()).
180  static constexpr char WRAPPER_FILE_NAME[] = "wrapper_metadata.json";
181  // We currently assign 10% of the cache to data wrapper space arbitrarily.
182  // static constexpr size_t WRAPPER_SPACE_RATIO{10};
183  // Portion of the CFM space reserved for metadata (metadata files and data wrappers)
184  static constexpr float METADATA_SPACE_PERCENTAGE{0.1};
185  // Portion of the CFM metadata space reserved for metadata files (subset of above).
186  static constexpr float METADATA_FILE_SPACE_PERCENTAGE{0.01};
187 
     // NOTE(review): the return statement of this function is missing from the listing.
188  static size_t getMinimumSize() {
189  // Currently the minimum default size is based on the metadata file size and
190  // percentage usage.
193  }
194 
     // Builds a CFM from the given disk cache configuration.
195  CachingFileMgr(const DiskCacheConfig& config);
196 
197  ~CachingFileMgr() override;
198 
199  // Simple getters.
200  inline MgrType getMgrType() override { return CACHING_FILE_MGR; };
201  inline std::string getStringMgrType() override { return ToString(CACHING_FILE_MGR); }
202  inline size_t getPageSize() { return page_size_; }
203  inline size_t getMaxSize() override { return max_size_; }
204  inline size_t getMaxDataFiles() const { return max_num_data_files_; }
205  inline size_t getMaxMetaFiles() const { return max_num_meta_files_; }
206  inline size_t getMaxWrapperSize() const { return max_wrapper_space_; }
     // Size of one data file: page size times the number of pages per data file.
207  inline size_t getDataFileSize() const { return page_size_ * num_pages_per_data_file_; }
     // NOTE(review): body missing from the listing.
208  inline size_t getMetadataFileSize() const {
210  }
211 
212  size_t getNumDataFiles() const;
213  size_t getNumMetaFiles() const;
     // NOTE(review): unsigned subtraction -- the result wraps around if getAllocated()
     // ever exceeds max_size_.
214  inline size_t getAvailableSpace() { return max_size_ - getAllocated(); }
     // NOTE(review): body missing from the listing.
215  inline size_t getAvailableWrapperSpace() {
217  }
     // Allocated space is the size of the page files plus the size of all table subdirs.
218  inline size_t getAllocated() override {
219  return getFilesSize() + getTableFileMgrsSize();
220  }
221  size_t getMaxDataFilesSize() const;
222 
     // Free pages for chunk and remove it from the chunk eviction algorithm.
226  void removeChunkKeepMetadata(const ChunkKey& key);
227 
     // Removes all data related to the given table (pages and subdirectories).
231  void clearForTable(int32_t db_id, int32_t tb_id);
232 
     // Query to determine if the contained pages will have their database and table ids
     // overridden by the file mgr key; always false for the CFM.
237  inline bool hasFileMgrKey() const override { return false; }
238 
     // Closes files and removes the caching directory.
242  void closeRemovePhysical() override;
243 
     // Per-table space accounting, split by chunk data, metadata and table subdir.
247  size_t getChunkSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
248  size_t getMetadataSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
249  size_t getTableFileMgrSpaceReserved(int32_t db_id, int32_t tb_id) const;
250  size_t getSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
251 
     // Describes this FileMgr for logging purposes.
255  std::string describeSelf() const override;
256 
     // Checkpoint override taking the database and table ids of the table to checkpoint.
261  void checkpoint(const int32_t db_id, const int32_t tb_id) override;
262 
     // Epoch override taking the database and table ids; the CFM keeps a separate epoch
     // per table (see table_dirs_).
266  int32_t epoch(int32_t db_id, int32_t tb_id) const override;
267 
     // Deletes any existing buffer for the given key then copies in a new one.
271  FileBuffer* putBuffer(const ChunkKey& key,
272  AbstractBuffer* srcBuffer,
273  const size_t numBytes = 0) override;
     // Allocates a new CachingFileBuffer and tracks its use in the eviction algorithms.
278  CachingFileBuffer* allocateBuffer(const size_t page_size,
279  const ChunkKey& key,
280  const size_t num_bytes = 0) override;
     // NOTE(review): the first line of this declaration (original line 281) is missing
     // from the listing; only its parameter list is visible.
282  const ChunkKey& key,
283  const std::vector<HeaderInfo>::const_iterator& headerStartIt,
284  const std::vector<HeaderInfo>::const_iterator& headerEndIt) override;
285 
     // Checks whether a page should be deleted.
289  bool updatePageIfDeleted(FileInfo* file_info,
290  ChunkKey& chunk_key,
291  int32_t contingent,
292  int32_t page_epoch,
293  int32_t page_num) override;
294 
     // True if a read error should cause a fatal error; the CFM returns false.
298  inline bool failOnReadError() const override { return false; }
299 
     // Deletes a buffer if it exists in the mgr. Otherwise do nothing.
303  void deleteBufferIfExists(const ChunkKey& key);
304 
     // Returns the number of buffers with metadata in the CFM. Any buffer with an encoder
     // counts.
309  size_t getNumChunksWithMetadata() const;
310 
     // Returns the number of buffers with chunk data in the CFM.
314  size_t getNumDataChunks() const;
315 
     // Returns the keys for chunks with chunk data that match the given prefix.
319  std::vector<ChunkKey> getChunkKeysForPrefix(const ChunkKey& prefix) const;
320 
     // Initializes a new CFM using the initialization values in the current CFM.
324  std::unique_ptr<CachingFileMgr> reconstruct() const;
325 
     // Deletes the wrapper file from a table subdir.
329  void deleteWrapperFile(int32_t db, int32_t tb);
330 
     // Writes a wrapper file to a table subdir.
334  void writeWrapperFile(const std::string& doc, int32_t db, int32_t tb);
335 
     // Presumably reports whether the table subdir has a wrapper file -- confirm in the .cpp.
339  bool hasWrapperFile(int32_t db_id, int32_t table_id) const;
340 
341  std::string getTableFileMgrPath(int32_t db, int32_t tb) const;
342 
     // Get the total size of page files (data and metadata files). This includes
     // allocated, but unused space.
347  size_t getFilesSize() const;
348 
     // Returns the total size of all subdirectory files (one subdirectory per table
     // represented in the CFM).
353  size_t getTableFileMgrsSize() const;
354 
     // An optional version of get buffer if we are not sure a chunk exists.
358  std::optional<FileBuffer*> getBufferIfExists(const ChunkKey& key);
359 
     // Unlike the FileMgr, the CFM frees pages immediately instead of holding them until
     // the next checkpoint.
364  void free_page(std::pair<FileInfo*, int32_t>&& page) override;
365 
     // NOTE(review): the first line of this declaration (original line 366) is missing
     // from the listing; only its last parameter is visible.
367  const ChunkKey& keyPrefix) override;
368 
369  // Useful for debugging.
370  std::string dumpKeysWithMetadata() const;
371  std::string dumpKeysWithChunkData() const;
372  std::string dumpTableQueue() const { return table_evict_alg_.dumpEvictionQueue(); }
373  std::string dumpEvictionQueue() const { return chunk_evict_alg_.dumpEvictionQueue(); }
374  std::string dump() const;
375 
376  // Used for unit testing
377  void setMaxNumDataFiles(size_t max) { max_num_data_files_ = max; }
378  void setMaxNumMetadataFiles(size_t max) { max_num_meta_files_ = max; }
379  void setMaxWrapperSpace(size_t max) { max_wrapper_space_ = max; }
380  std::set<ChunkKey> getKeysWithMetadata() const;
381  void setDataSizeLimit(size_t max) { limit_data_size_ = max; }
382 
     // Requests a free page similar to FileMgr, but this override will also evict
     // existing pages to make space.
387  Page requestFreePage(size_t pagesize, const bool isMetadata) override;
388 
389  private:
     // Per-table variant of incrementEpoch taking the database and table ids.
393  void incrementEpoch(int32_t db_id, int32_t tb_id);
394 
     // Initializes a CFM, parsing any existing files and initializing data structures
     // appropriately.
399  void init(const size_t num_reader_threads);
400 
     // Per-table variant of writeAndSyncEpochToDisk taking the database and table ids.
404  void writeAndSyncEpochToDisk(int32_t db_id, int32_t tb_id);
405 
     // Checks for any sub-directories containing table-specific data and creates epochs
     // from found files.
410  void readTableFileMgrs();
411 
     // NOTE(review): the first line of this declaration (original line 415) is missing
     // from the listing; only its parameter list is visible.
416  const ChunkKey& key,
417  const std::vector<HeaderInfo>::const_iterator& startIt,
418  const std::vector<HeaderInfo>::const_iterator& endIt) override;
419 
     // NOTE(review): the first line of this declaration (original line 423) is missing
     // from the listing; only its trailing parameters are visible.
424  size_t pageSize = 0,
425  const size_t numBytes = 0) override;
426 
     // Create and initialize a subdirectory for a table if none exists.
430  void createTableFileMgrIfNoneExists(const int32_t db_id, const int32_t tb_id);
431 
     // Increment epochs for each table in the CFM.
435  void incrementAllEpochs();
436 
     // Removes the subdirectory content for a table.
440  void removeTableFileMgr(int32_t db_id, int32_t tb_id);
441 
     // Erases and cleans up all buffers for a table.
445  void removeTableBuffers(int32_t db_id, int32_t tb_id);
446 
     // Presumably writes out the dirty buffers of the given table -- confirm in the .cpp.
450  void writeDirtyBuffers(int32_t db_id, int32_t tb_id);
451 
     // Used to track which tables/chunks were least recently used.
455  void touchKey(const ChunkKey& key) const;
456  void removeKey(const ChunkKey& key) const;
457 
     // Returns the keys contained in chunkIndex_ that match the given table prefix.
462  std::vector<ChunkKey> getKeysForTable(int32_t db_id, int32_t tb_id) const;
463 
470 
     // Evicts all data pages for the least recently used chunk (metadata pages persist).
476  FileInfo* evictPages();
477 
     // When the cache is read from disk, we don't know which chunks were least recently
     // used. Rather than try to evict random pages to get down to size we just reset the
     // cache to make sure we have space.
483  void deleteCacheIfTooLarge();
484 
     // Sets the maximum number of files/space for each type of storage based on the
     // maximum size.
489  void setMaxSizes();
490 
     // NOTE(review): the first line of this declaration (original line 491) is missing
     // from the listing; only its last parameter is visible.
492  const size_t numBytes = 0) const override;
493 
494  ChunkKeyToChunkMap::iterator deleteBufferUnlocked(
495  const ChunkKeyToChunkMap::iterator chunk_it,
496  const bool purge = true) override;
497 
498  // CachingFileMgr is allowed to write in read-only mode, so override this function.
499  void readOnlyCheck(const std::string& action,
500  const std::optional<std::string>& file_name = {}) const override{};
501 
502  mutable heavyai::shared_mutex table_dirs_mutex_; // mutex for table_dirs_.
503  // each table gets a separate epoch. Uses pointers for move semantics.
504  std::map<TablePair, std::unique_ptr<TableFileMgr>> table_dirs_;
505 
506  size_t max_num_data_files_; // set based on max_size_.
507  size_t max_num_meta_files_; // set based on max_size_.
508  size_t max_wrapper_space_; // set based on max_size_.
509  size_t max_size_;
510  std::optional<size_t> limit_data_size_{}; // Used for testing artificially small caches.
511 
512  mutable LRUEvictionAlgorithm chunk_evict_alg_; // last chunk touched.
513  mutable LRUEvictionAlgorithm table_evict_alg_; // last table touched.
514 };
515 
516 } // namespace File_Namespace
const size_t metadata_page_size_
Definition: FileMgr.h:552
size_t getTableFileMgrSpaceReserved(int32_t db_id, int32_t tb_id) const
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunkMetadataVec, const ChunkKey &keyPrefix) override
std::vector< int > ChunkKey
Definition: types.h:36
std::ostream & operator<<(std::ostream &os, DiskCacheLevel disk_cache_level)
LRUEvictionAlgorithm table_evict_alg_
std::string dumpTableQueue() const
void removeDiskContent() const
Removes all disk data for the subdir.
std::string getStringMgrType() override
static constexpr size_t DEFAULT_NUM_PAGES_PER_METADATA_FILE
Definition: FileMgr.h:385
const size_t page_size_
Definition: FileMgr.h:551
static constexpr float METADATA_SPACE_PERCENTAGE
heavyai::shared_mutex table_dirs_mutex_
const std::string kDefaultDiskCacheDirName
std::string get_dir_name_for_table(int db_id, int tb_id)
void writeWrapperFile(const std::string &doc, int32_t db, int32_t tb)
Writes a wrapper file to a table subdir.
A logical page (Page) belongs to a file on disk.
Definition: Page.h:46
size_t getFilesSize() const
Get the total size of page files (data and metadata files). This includes allocated, but unused space.
std::vector< ChunkKey > getChunkKeysForPrefix(const ChunkKey &prefix) const
Returns the keys for chunks with chunk data that match the given prefix.
void setMaxSizes()
Sets the maximum number of files/space for each type of storage based on the maximum size...
void writeAndSyncEpochToDisk()
Write and flush the epoch to the epoch file on disk.
#define UNREACHABLE()
Definition: Logger.h:338
This file includes the class specification for the FILE manager (FileMgr), and related data structure...
std::string describeSelf() const override
describes this FileMgr for logging purposes.
size_t getSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
void closeRemovePhysical() override
Closes files and removes the caching directory.
void touchKey(const ChunkKey &key) const
Used to track which tables/chunks were least recently used.
#define DEFAULT_METADATA_PAGE_SIZE
size_t getMetadataSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
void createTableFileMgrIfNoneExists(const int32_t db_id, const int32_t tb_id)
Create and initialize a subdirectory for a table if none exists.
Represents/provides access to contiguous data stored in the file system.
Definition: FileBuffer.h:57
void checkpoint() override
Fsyncs data files, writes out epoch and fsyncs that.
Definition: FileMgr.cpp:706
Page requestFreePage(size_t pagesize, const bool isMetadata) override
requests a free page similar to FileMgr, but this override will also evict existing pages to make spa...
void deleteWrapperFile(int32_t db, int32_t tb)
Deletes the wrapper file from a table subdir.
ChunkKeyToChunkMap::iterator deleteBufferUnlocked(const ChunkKeyToChunkMap::iterator chunk_it, const bool purge=true) override
CachingFileMgr(const DiskCacheConfig &config)
static size_t num_pages_per_data_file_
Definition: FileMgr.h:427
std::optional< FileBuffer * > getBufferIfExists(const ChunkKey &key)
an optional version of get buffer if we are not sure a chunk exists.
std::set< ChunkKey > getKeysWithMetadata() const
bool failOnReadError() const override
True if a read error should cause a fatal error.
FileInfo * evictPages()
evicts all data pages for the least recently used Chunk (metadata pages persist). Returns the first F...
FileBuffer * createBufferUnlocked(const ChunkKey &key, size_t pageSize=0, const size_t numBytes=0) override
Creates a buffer.
void deleteCacheIfTooLarge()
When the cache is read from disk, we don't know which chunks were least recently used. Rather than try to evict random pages to get down to size we just reset the cache to make sure we have space.
heavyai::shared_mutex table_mutex_
void incrementAllEpochs()
Increment epochs for each table in the CFM.
CachingFileBuffer * allocateBuffer(const size_t page_size, const ChunkKey &key, const size_t num_bytes=0) override
allocates a new CachingFileBuffer and tracks its use in the eviction algorithms. ...
int32_t incrementEpoch()
Definition: FileMgr.h:285
size_t getNumDataChunks() const
Returns the number of buffers with chunk data in the CFM.
bool hasWrapperFile(int32_t db_id, int32_t table_id) const
std::unique_ptr< CachingFileMgr > reconstruct() const
Initializes a new CFM using the initialization values in the current CFM.
size_t getTableFileMgrsSize() const
Returns the total size of all subdirectory files. Each table represented in the CFM has a subdirector...
void removeTableBuffers(int32_t db_id, int32_t tb_id)
Erases and cleans up all buffers for a table.
TableFileMgr(const std::string &table_path)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
static constexpr size_t DEFAULT_MAX_SIZE
bool updatePageIfDeleted(FileInfo *file_info, ChunkKey &chunk_key, int32_t contingent, int32_t page_epoch, int32_t page_num) override
checks whether a page should be deleted.
void deleteWrapperFile() const
Deletes only the wrapper file on disk.
An AbstractBuffer is a unit of data management for a data manager.
void incrementEpoch()
increment the epoch for this subdir (not synced to disk).
void setMaxNumDataFiles(size_t max)
void deleteBufferIfExists(const ChunkKey &key)
deletes a buffer if it exists in the mgr. Otherwise do nothing.
static size_t num_pages_per_metadata_file_
Definition: FileMgr.h:428
void writeAndSyncEpochToDisk()
Definition: FileMgr.cpp:659
void removeTableFileMgr(int32_t db_id, int32_t tb_id)
Removes the subdirectory content for a table.
std::optional< size_t > limit_data_size_
#define DEFAULT_PAGE_SIZE
void setMaxNumMetadataFiles(size_t max)
void removeChunkKeepMetadata(const ChunkKey &key)
Free pages for chunk and remove it from the chunk eviction algorithm.
void writeWrapperFile(const std::string &doc) const
Writes wrapper file to disk.
bool hasFileMgrKey() const override
Query to determine if the contained pages will have their database and table ids overridden by the fil...
std::string dumpKeysWithChunkData() const
Definition: Epoch.h:30
void write(int8_t *src, const size_t numBytes, const size_t offset=0, const MemoryLevel srcMemoryLevel=CPU_LEVEL, const int32_t deviceId=-1) override
static constexpr char WRAPPER_FILE_NAME[]
size_t getReservedSpace() const
Returns the disk space used (in bytes) for the subdir.
std::map< TablePair, std::unique_ptr< TableFileMgr > > table_dirs_
std::string dumpKeysWithMetadata() const
std::vector< ChunkKey > getKeysForTable(int32_t db_id, int32_t tb_id) const
returns set of keys contained in chunkIndex_ that match the given table prefix.
void readTableFileMgrs()
Checks for any sub-directories containing table-specific data and creates epochs from found files...
FileInfo * evictMetadataPages()
evicts all metadata pages for the least recently used table. Returns the first FileInfo that a page w...
FileBuffer(FileMgr *fm, const size_t pageSize, const ChunkKey &chunkKey, const size_t initialSize=0)
Constructs a FileBuffer object.
Definition: FileBuffer.cpp:37
int32_t epoch() const
Definition: FileMgr.h:530
void close(FILE *f)
Closes the file pointed to by the FILE pointer.
Definition: File.cpp:114
void readOnlyCheck(const std::string &action, const std::optional< std::string > &file_name={}) const override
void init(const size_t num_reader_threads)
Initializes a CFM, parsing any existing files and initializing data structures appropriately (current...
static std::string getDefaultPath(const std::string &base_path)
std::shared_timed_mutex shared_mutex
void free_page(std::pair< FileInfo *, int32_t > &&page) override
Unlike the FileMgr, the CFM frees pages immediately instead of holding them until the next checkpoint...
size_t getNumChunksWithMetadata() const
Returns the number of buffers with metadata in the CFM. Any buffer with an encoder counts...
size_t getChunkSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
FileBuffer * getBufferUnlocked(const ChunkKey &key, const size_t numBytes=0) const override
static constexpr float METADATA_FILE_SPACE_PERCENTAGE
int32_t getEpoch() const
Returns the current epoch (locked)
std::string getTableFileMgrPath(int32_t db, int32_t tb) const
A selection of helper methods for File I/O.
void clearForTable(int32_t db_id, int32_t tb_id)
Removes all data related to the given table (pages and subdirectories).
void removeKey(const ChunkKey &key) const
This file includes the class specification for the Least Recently Used cache eviction algorithm used ...
A FileMgr capable of limiting its size and storing data from multiple tables in a shared directory...
void setMaxWrapperSpace(size_t max)
LRUEvictionAlgorithm chunk_evict_alg_
std::string levelAsString() const
FileBuffer * putBuffer(const ChunkKey &key, AbstractBuffer *srcBuffer, const size_t numBytes=0) override
deletes any existing buffer for the given key then copies in a new one.
std::string dumpEvictionQueue() const
FileBuffer * createBufferFromHeaders(const ChunkKey &key, const std::vector< HeaderInfo >::const_iterator &startIt, const std::vector< HeaderInfo >::const_iterator &endIt) override
Creates a buffer and initializes it with info read from files on disk.