OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CachingFileMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
30 #pragma once
31 
32 #include <optional>
34 #include "FileMgr.h"
35 #include "Shared/File.h"
36 #include "Shared/SysDefinitions.h"
37 
38 namespace File_Namespace {
39 
/// Selects how much of the workload the disk cache is enabled for.
enum class DiskCacheLevel {
  none,     // compared against by isEnabled(): this level means "not enabled"
  fsi,      // NOTE(review): presumably foreign-storage (FSI) tables only -- confirm
  non_fsi,  // NOTE(review): presumably non-FSI tables only -- confirm
  all       // NOTE(review): presumably every table category -- confirm
};
42  // Note: the suffix UL is not portable. 'long' is a different size
43  // on *nix vs. Windows, hence the explicit casts to size_t.
44  static constexpr size_t DEFAULT_MAX_SIZE{(size_t)1024 * (size_t)1024 * (size_t)1024 *
45  (size_t)100}; // 100G default
46  std::string path;
48  size_t num_reader_threads = 0;
51  inline bool isEnabledForMutableTables() const {
54  }
55  inline bool isEnabledForFSI() const {
57  }
58  inline bool isEnabled() const { return enabled_level != DiskCacheLevel::none; }
59  std::string dump() const {
60  std::stringstream ss;
61  ss << "DiskCacheConfig(path = " << path << ", level = " << levelAsString()
62  << ", threads = " << num_reader_threads << ", size limit = " << size_limit
63  << ", page size = " << page_size << ")";
64  return ss.str();
65  }
66  std::string levelAsString() const {
67  switch (enabled_level) {
69  return "none";
71  return "fsi";
73  return "non_fsi";
75  return "all";
76  }
77  return "";
78  }
79  static std::string getDefaultPath(const std::string& base_path) {
80  return base_path + "/" + shared::kDefaultDiskCacheDirName;
81  }
82 };
83 
/// Builds the directory name "table_<db_id>_<tb_id>/" (trailing slash included).
inline std::string get_dir_name_for_table(int db_id, int tb_id) {
  std::ostringstream dir_name;
  dir_name << "table_" << db_id;
  dir_name << "_" << tb_id;
  dir_name << "/";
  return dir_name.str();
}
89 
// Class to control access to the table-specific directories and data created inside a
// CachingFileMgr.
class TableFileMgr {
 public:
  // `explicit` keeps a plain path string from being silently converted into a
  // TableFileMgr; construction must be spelled out by the caller.
  explicit TableFileMgr(const std::string& table_path);

  // Increments the epoch for this subdir (not synced to disk).
  void incrementEpoch();

  // Returns the current epoch.
  int32_t getEpoch() const;

  // Removes all disk data for the subdir.
  void removeDiskContent() const;

  // Returns the disk space used (in bytes) for the subdir.
  size_t getReservedSpace() const;

  // Deletes only the wrapper file on disk.
  void deleteWrapperFile() const;

  // Writes the wrapper file (content `doc`) to disk.
  void writeWrapperFile(const std::string& doc) const;

  // NOTE(review): presumably true when the wrapper file exists on disk; the
  // implementation is not visible from this header -- confirm.
  bool hasWrapperFile() const;

 private:
  std::string table_path_;
  std::string epoch_file_path_;
  std::string wrapper_file_path_;
  bool is_checkpointed_ = true;
  // NOTE(review): raw FILE handle; where it is opened and closed is not
  // visible from this header -- confirm it is released in the implementation.
  FILE* epoch_file_ = nullptr;
};
147 
148 // Extension of FileBuffer with restricted behaviour.
150  public:
152  // The cache can only be appended to, not written, as it lets us maintain a single
153  // version of the data. This override is to make sure we don't accidentally start
154  // writing to cache buffers.
155  void write(int8_t* src,
156  const size_t numBytes,
157  const size_t offset = 0,
158  const MemoryLevel srcMemoryLevel = CPU_LEVEL,
159  const int32_t deviceId = -1) override {
160  UNREACHABLE() << "Cache buffers support append(), but not write()";
161  }
162 };
163 
172 class CachingFileMgr : public FileMgr {
173  public:
174  static constexpr char WRAPPER_FILE_NAME[] = "wrapper_metadata.json";
175  // We currently assign 10% of the cache to data wrapper space arbitrarily.
176  // static constexpr size_t WRAPPER_SPACE_RATIO{10};
177  // Portion of the CFM space reserved for metadata (metadata files and data wrappers)
178  static constexpr float METADATA_SPACE_PERCENTAGE{0.1};
179  // Portion of the CFM metadata space reserved for metadata files (subset of above).
180  static constexpr float METADATA_FILE_SPACE_PERCENTAGE{0.01};
181 
182  static size_t getMinimumSize() {
183  // Currently the minimum default size is based on the metadata file size and
184  // percentage usage.
187  }
188 
189  CachingFileMgr(const DiskCacheConfig& config);
190 
191  ~CachingFileMgr() override;
192 
193  // Simple getters.
194  inline MgrType getMgrType() override { return CACHING_FILE_MGR; };
195  inline std::string getStringMgrType() override { return ToString(CACHING_FILE_MGR); }
196  inline size_t getDefaultPageSize() { return defaultPageSize_; }
197  inline size_t getMaxSize() override { return max_size_; }
198  inline size_t getMaxDataFiles() const { return max_num_data_files_; }
199  inline size_t getMaxMetaFiles() const { return max_num_meta_files_; }
200  inline size_t getMaxWrapperSize() const { return max_wrapper_space_; }
201  inline size_t getDataFileSize() const {
203  }
204  inline size_t getMetadataFileSize() const {
206  }
207 
208  size_t getNumDataFiles() const;
209  size_t getNumMetaFiles() const;
210  inline size_t getAvailableSpace() { return max_size_ - getAllocated(); }
211  inline size_t getAvailableWrapperSpace() {
213  }
214  inline size_t getAllocated() override {
215  return getFilesSize() + getTableFileMgrsSize();
216  }
217  size_t getMaxDataFilesSize() const;
218 
222  void removeChunkKeepMetadata(const ChunkKey& key);
223 
227  void clearForTable(int32_t db_id, int32_t tb_id);
228 
233  inline bool hasFileMgrKey() const override { return false; }
234 
238  void closeRemovePhysical() override;
239 
243  size_t getChunkSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
244  size_t getMetadataSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
245  size_t getTableFileMgrSpaceReserved(int32_t db_id, int32_t tb_id) const;
246  size_t getSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
247 
251  std::string describeSelf() const override;
252 
257  void checkpoint(const int32_t db_id, const int32_t tb_id) override;
258 
262  int32_t epoch(int32_t db_id, int32_t tb_id) const override;
263 
267  FileBuffer* putBuffer(const ChunkKey& key,
268  AbstractBuffer* srcBuffer,
269  const size_t numBytes = 0) override;
274  CachingFileBuffer* allocateBuffer(const size_t page_size,
275  const ChunkKey& key,
276  const size_t num_bytes = 0) override;
278  const ChunkKey& key,
279  const std::vector<HeaderInfo>::const_iterator& headerStartIt,
280  const std::vector<HeaderInfo>::const_iterator& headerEndIt) override;
281 
285  bool updatePageIfDeleted(FileInfo* file_info,
286  ChunkKey& chunk_key,
287  int32_t contingent,
288  int32_t page_epoch,
289  int32_t page_num) override;
290 
294  inline bool failOnReadError() const override { return false; }
295 
299  void deleteBufferIfExists(const ChunkKey& key);
300 
305  size_t getNumChunksWithMetadata() const;
306 
310  size_t getNumDataChunks() const;
311 
315  std::vector<ChunkKey> getChunkKeysForPrefix(const ChunkKey& prefix) const;
316 
320  std::unique_ptr<CachingFileMgr> reconstruct() const;
321 
325  void deleteWrapperFile(int32_t db, int32_t tb);
326 
330  void writeWrapperFile(const std::string& doc, int32_t db, int32_t tb);
331 
335  bool hasWrapperFile(int32_t db_id, int32_t table_id) const;
336 
337  std::string getTableFileMgrPath(int32_t db, int32_t tb) const;
338 
343  size_t getFilesSize() const;
344 
349  size_t getTableFileMgrsSize() const;
350 
354  std::optional<FileBuffer*> getBufferIfExists(const ChunkKey& key);
355 
360  void free_page(std::pair<FileInfo*, int32_t>&& page) override;
361 
363  const ChunkKey& keyPrefix) override;
364 
365  // Useful for debugging.
366  std::string dumpKeysWithMetadata() const;
367  std::string dumpKeysWithChunkData() const;
368  std::string dumpTableQueue() const { return table_evict_alg_.dumpEvictionQueue(); }
369  std::string dumpEvictionQueue() const { return chunk_evict_alg_.dumpEvictionQueue(); }
370  std::string dump() const;
371 
372  // Used for unit testing
373  void setMaxNumDataFiles(size_t max) { max_num_data_files_ = max; }
374  void setMaxNumMetadataFiles(size_t max) { max_num_meta_files_ = max; }
375  void setMaxWrapperSpace(size_t max) { max_wrapper_space_ = max; }
376  std::set<ChunkKey> getKeysWithMetadata() const;
377  void setDataSizeLimit(size_t max) { limit_data_size_ = max; }
378 
379  private:
383  void incrementEpoch(int32_t db_id, int32_t tb_id);
384 
389  void init(const size_t num_reader_threads);
390 
394  void writeAndSyncEpochToDisk(int32_t db_id, int32_t tb_id);
395 
400  void readTableFileMgrs();
401 
406  const ChunkKey& key,
407  const std::vector<HeaderInfo>::const_iterator& startIt,
408  const std::vector<HeaderInfo>::const_iterator& endIt) override;
409 
414  size_t pageSize = 0,
415  const size_t numBytes = 0) override;
416 
420  void createTableFileMgrIfNoneExists(const int32_t db_id, const int32_t tb_id);
421 
425  void incrementAllEpochs();
426 
430  void removeTableFileMgr(int32_t db_id, int32_t tb_id);
431 
435  void removeTableBuffers(int32_t db_id, int32_t tb_id);
436 
440  void writeDirtyBuffers(int32_t db_id, int32_t tb_id);
441 
446  Page requestFreePage(size_t pagesize, const bool isMetadata) override;
447 
451  void touchKey(const ChunkKey& key) const;
452  void removeKey(const ChunkKey& key) const;
453 
458  std::vector<ChunkKey> getKeysForTable(int32_t db_id, int32_t tb_id) const;
459 
466 
472  FileInfo* evictPages();
473 
479  void deleteCacheIfTooLarge();
480 
485  void setMaxSizes();
486 
488  const size_t numBytes = 0) const override;
489  ChunkKeyToChunkMap::iterator deleteBufferUnlocked(
490  const ChunkKeyToChunkMap::iterator chunk_it,
491  const bool purge = true) override;
492 
493  mutable heavyai::shared_mutex table_dirs_mutex_; // mutex for table_dirs_.
494  // Each table gets a separate epoch. Uses pointers for move semantics.
495  std::map<TablePair, std::unique_ptr<TableFileMgr>> table_dirs_;
496 
497  size_t max_num_data_files_; // set based on max_size_.
498  size_t max_num_meta_files_; // set based on max_size_.
499  size_t max_wrapper_space_; // set based on max_size_.
500  size_t max_size_;
501  std::optional<size_t> limit_data_size_{}; // Used for testing artificially small caches.
502 
503  mutable LRUEvictionAlgorithm chunk_evict_alg_; // last chunk touched.
504  mutable LRUEvictionAlgorithm table_evict_alg_; // last table touched.
505 };
506 
507 } // namespace File_Namespace
size_t getTableFileMgrSpaceReserved(int32_t db_id, int32_t tb_id) const
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunkMetadataVec, const ChunkKey &keyPrefix) override
#define METADATA_PAGE_SIZE
Definition: FileBuffer.h:37
std::vector< int > ChunkKey
Definition: types.h:36
LRUEvictionAlgorithm table_evict_alg_
std::string dumpTableQueue() const
void removeDiskContent() const
Removes all disk data for the subdir.
std::string getStringMgrType() override
static constexpr size_t DEFAULT_NUM_PAGES_PER_METADATA_FILE
Definition: FileMgr.h:371
static constexpr float METADATA_SPACE_PERCENTAGE
heavyai::shared_mutex table_dirs_mutex_
const std::string kDefaultDiskCacheDirName
std::string get_dir_name_for_table(int db_id, int tb_id)
void writeWrapperFile(const std::string &doc, int32_t db, int32_t tb)
Writes a wrapper file to a table subdir.
A logical page (Page) belongs to a file on disk.
Definition: Page.h:46
size_t getFilesSize() const
Get the total size of page files (data and metadata files). This includes allocated, but unused space.
std::vector< ChunkKey > getChunkKeysForPrefix(const ChunkKey &prefix) const
Returns the keys for chunks with chunk data that match the given prefix.
void setMaxSizes()
Sets the maximum number of files/space for each type of storage based on the maximum size...
void writeAndSyncEpochToDisk()
Write and flush the epoch to the epoch file on disk.
#define UNREACHABLE()
Definition: Logger.h:266
This file includes the class specification for the FILE manager (FileMgr), and related data structure...
std::string describeSelf() const override
describes this FileMgr for logging purposes.
size_t getSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
void closeRemovePhysical() override
Closes files and removes the caching directory.
void touchKey(const ChunkKey &key) const
Used to track which tables/chunks were least recently used.
size_t getMetadataSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
void createTableFileMgrIfNoneExists(const int32_t db_id, const int32_t tb_id)
Create and initialize a subdirectory for a table if none exists.
Represents/provides access to contiguous data stored in the file system.
Definition: FileBuffer.h:58
void checkpoint() override
Fsyncs data files, writes out epoch and fsyncs that.
Definition: FileMgr.cpp:694
Page requestFreePage(size_t pagesize, const bool isMetadata) override
requests a free page similar to FileMgr, but this override will also evict existing pages to make spa...
void deleteWrapperFile(int32_t db, int32_t tb)
Deletes the wrapper file from a table subdir.
ChunkKeyToChunkMap::iterator deleteBufferUnlocked(const ChunkKeyToChunkMap::iterator chunk_it, const bool purge=true) override
CachingFileMgr(const DiskCacheConfig &config)
static size_t num_pages_per_data_file_
Definition: FileMgr.h:417
std::optional< FileBuffer * > getBufferIfExists(const ChunkKey &key)
an optional version of get buffer if we are not sure a chunk exists.
std::set< ChunkKey > getKeysWithMetadata() const
bool failOnReadError() const override
True if a read error should cause a fatal error.
FileInfo * evictPages()
evicts all data pages for the least recently used Chunk (metadata pages persist). Returns the first F...
FileBuffer * createBufferUnlocked(const ChunkKey &key, size_t pageSize=0, const size_t numBytes=0) override
Creates a buffer.
void deleteCacheIfTooLarge()
When the cache is read from disk, we don't know which chunks were least recently used. Rather than try to evict random pages to get down to size we just reset the cache to make sure we have space.
heavyai::shared_mutex table_mutex_
void incrementAllEpochs()
Increment epochs for each table in the CFM.
CachingFileBuffer * allocateBuffer(const size_t page_size, const ChunkKey &key, const size_t num_bytes=0) override
allocates a new CachingFileBuffer and tracks its use in the eviction algorithms. ...
int32_t incrementEpoch()
Definition: FileMgr.h:283
size_t getNumDataChunks() const
Returns the number of buffers with chunk data in the CFM.
bool hasWrapperFile(int32_t db_id, int32_t table_id) const
std::unique_ptr< CachingFileMgr > reconstruct() const
Initializes a new CFM using the initialization values in the current CFM.
size_t getTableFileMgrsSize() const
Returns the total size of all subdirectory files. Each table represented in the CFM has a subdirector...
void removeTableBuffers(int32_t db_id, int32_t tb_id)
Erases and cleans up all buffers for a table.
TableFileMgr(const std::string &table_path)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
static constexpr size_t DEFAULT_MAX_SIZE
bool updatePageIfDeleted(FileInfo *file_info, ChunkKey &chunk_key, int32_t contingent, int32_t page_epoch, int32_t page_num) override
checks whether a page should be deleted.
void deleteWrapperFile() const
Deletes only the wrapper file on disk.
An AbstractBuffer is a unit of data management for a data manager.
void incrementEpoch()
increment the epoch for this subdir (not synced to disk).
void setMaxNumDataFiles(size_t max)
void deleteBufferIfExists(const ChunkKey &key)
deletes a buffer if it exists in the mgr. Otherwise do nothing.
static size_t num_pages_per_metadata_file_
Definition: FileMgr.h:418
void writeAndSyncEpochToDisk()
Definition: FileMgr.cpp:647
void removeTableFileMgr(int32_t db_id, int32_t tb_id)
Removes the subdirectory content for a table.
std::optional< size_t > limit_data_size_
#define DEFAULT_PAGE_SIZE
size_t defaultPageSize_
number of threads used when loading data
Definition: FileMgr.h:402
void setMaxNumMetadataFiles(size_t max)
void removeChunkKeepMetadata(const ChunkKey &key)
Free pages for chunk and remove it from the chunk eviction algorithm.
void writeWrapperFile(const std::string &doc) const
Writes wrapper file to disk.
bool hasFileMgrKey() const override
Query to determine if the contained pages will have their database and table ids overridden by the fil...
std::string dumpKeysWithChunkData() const
Definition: Epoch.h:30
void write(int8_t *src, const size_t numBytes, const size_t offset=0, const MemoryLevel srcMemoryLevel=CPU_LEVEL, const int32_t deviceId=-1) override
static constexpr char WRAPPER_FILE_NAME[]
size_t getReservedSpace() const
Returns the disk space used (in bytes) for the subdir.
std::map< TablePair, std::unique_ptr< TableFileMgr > > table_dirs_
std::string dumpKeysWithMetadata() const
std::vector< ChunkKey > getKeysForTable(int32_t db_id, int32_t tb_id) const
returns set of keys contained in chunkIndex_ that match the given table prefix.
void readTableFileMgrs()
Checks for any sub-directories containing table-specific data and creates epochs from found files...
FileInfo * evictMetadataPages()
evicts all metadata pages for the least recently used table. Returns the first FileInfo that a page w...
FileBuffer(FileMgr *fm, const size_t pageSize, const ChunkKey &chunkKey, const size_t initialSize=0)
Constructs a FileBuffer object.
Definition: FileBuffer.cpp:38
int32_t epoch() const
Definition: FileMgr.h:517
void close(FILE *f)
Closes the file pointed to by the FILE pointer.
Definition: File.cpp:128
void init(const size_t num_reader_threads)
Initializes a CFM, parsing any existing files and initializing data structures appropriately (current...
static std::string getDefaultPath(const std::string &base_path)
std::shared_timed_mutex shared_mutex
void free_page(std::pair< FileInfo *, int32_t > &&page) override
Unlike the FileMgr, the CFM frees pages immediately instead of holding them until the next checkpoint...
size_t getNumChunksWithMetadata() const
Returns the number of buffers with metadata in the CFM. Any buffer with an encoder counts...
size_t getChunkSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
FileBuffer * getBufferUnlocked(const ChunkKey &key, const size_t numBytes=0) const override
static constexpr float METADATA_FILE_SPACE_PERCENTAGE
int32_t getEpoch() const
Returns the current epoch (locked)
std::string getTableFileMgrPath(int32_t db, int32_t tb) const
A selection of helper methods for File I/O.
void clearForTable(int32_t db_id, int32_t tb_id)
Removes all data related to the given table (pages and subdirectories).
void removeKey(const ChunkKey &key) const
This file includes the class specification for the Least Recently Used cache eviction algorithm used ...
A FileMgr capable of limiting its size and storing data from multiple tables in a shared directory...
void setMaxWrapperSpace(size_t max)
LRUEvictionAlgorithm chunk_evict_alg_
std::string levelAsString() const
FileBuffer * putBuffer(const ChunkKey &key, AbstractBuffer *srcBuffer, const size_t numBytes=0) override
deletes any existing buffer for the given key then copies in a new one.
std::string dumpEvictionQueue() const
FileBuffer * createBufferFromHeaders(const ChunkKey &key, const std::vector< HeaderInfo >::const_iterator &startIt, const std::vector< HeaderInfo >::const_iterator &endIt) override
Creates a buffer and initializes it with info read from files on disk.