OmniSciDB  a987f07e93
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CachingFileMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
30 #pragma once
31 
32 #include <optional>
34 #include "FileMgr.h"
35 #include "Shared/File.h"
36 #include "Shared/SysDefinitions.h"
37 
38 namespace File_Namespace {
39 
40 enum class DiskCacheLevel { none, fsi, non_fsi, all };
42  // Note the suffix UL is not portable. 'long' is a different size
43  // on *nix vs. Windows. Hence the explicit cast to size_t.
44  static constexpr size_t DEFAULT_MAX_SIZE{(size_t)1024 * (size_t)1024 * (size_t)1024 *
45  (size_t)100}; // 100G default
46  std::string path;
48  size_t num_reader_threads = 0;
51  inline bool isEnabledForMutableTables() const {
54  }
55  inline bool isEnabledForFSI() const {
57  }
58  inline bool isEnabled() const { return enabled_level != DiskCacheLevel::none; }
59  std::string dump() const {
60  std::stringstream ss;
61  ss << "DiskCacheConfig(path = " << path << ", level = " << levelAsString()
62  << ", threads = " << num_reader_threads << ", size limit = " << size_limit
63  << ", page size = " << page_size << ")";
64  return ss.str();
65  }
66  std::string levelAsString() const {
67  switch (enabled_level) {
69  return "none";
71  return "fsi";
73  return "non_fsi";
75  return "all";
76  }
77  return "";
78  }
79  static std::string getDefaultPath(const std::string& base_path) {
80  return base_path + "/" + shared::kDefaultDiskCacheDirName;
81  }
82 };
83 
/**
 * @brief Builds the directory name for a table.
 *
 * The result has the form "table_<db_id>_<tb_id>/" (note the trailing slash).
 * The ids are formatted with std::to_string rather than a stream so that the
 * digits are never affected by the global C++ locale (for example a locale
 * that inserts thousands separators), which keeps the name stable.
 *
 * @param db_id database id.
 * @param tb_id table id.
 * @return the directory name, including the trailing '/'.
 */
inline std::string get_dir_name_for_table(int db_id, int tb_id) {
  return "table_" + std::to_string(db_id) + "_" + std::to_string(tb_id) + "/";
}
89 
90 // Class to control access to the table-specific directories and data created inside a
91 // CachingFileMgr.
92 class TableFileMgr {
93  public:
94  TableFileMgr(const std::string& table_path);
96 
100  void incrementEpoch();
101 
106 
110  int32_t getEpoch() const;
111 
115  void removeDiskContent() const;
116 
120  size_t getReservedSpace() const;
121 
125  void deleteWrapperFile() const;
126 
130  void writeWrapperFile(const std::string& doc) const;
131 
135  bool hasWrapperFile() const;
136 
137  private:
138  std::string table_path_;
139  std::string epoch_file_path_;
140  std::string wrapper_file_path_;
142  bool is_checkpointed_ = true;
143  FILE* epoch_file_ = nullptr;
144 
146 };
147 
148 // Extension of FileBuffer with restricted behaviour.
150  public:
152  // The cache can only be appended to, not written, as it lets us maintain a single
153  // version of the data. This override is to make sure we don't accidentally start
154  // writing to cache buffers.
155  void write(int8_t* src,
156  const size_t numBytes,
157  const size_t offset = 0,
158  const MemoryLevel srcMemoryLevel = CPU_LEVEL,
159  const int32_t deviceId = -1) override {
160  UNREACHABLE() << "Cache buffers support append(), but not write()";
161  }
162 };
163 
172 class CachingFileMgr : public FileMgr {
173  public:
174  static constexpr char WRAPPER_FILE_NAME[] = "wrapper_metadata.json";
175  // We currently assign 10% of the cache to data wrapper space arbitrarily.
176  // static constexpr size_t WRAPPER_SPACE_RATIO{10};
177  // Portion of the CFM space reserved for metadata (metadata files and data wrappers)
178  static constexpr float METADATA_SPACE_PERCENTAGE{0.1};
179  // Portion of the CFM metadata space reserved for metadata files (subset of above).
180  static constexpr float METADATA_FILE_SPACE_PERCENTAGE{0.01};
181 
182  static size_t getMinimumSize() {
183  // Currently the minimum default size is based on the metadata file size and
184  // percentage usage.
187  }
188 
189  CachingFileMgr(const DiskCacheConfig& config);
190 
191  ~CachingFileMgr() override;
192 
193  // Simple getters.
194  inline MgrType getMgrType() override { return CACHING_FILE_MGR; };
195  inline std::string getStringMgrType() override { return ToString(CACHING_FILE_MGR); }
196  inline size_t getPageSize() { return page_size_; }
197  inline size_t getMaxSize() override { return max_size_; }
198  inline size_t getMaxDataFiles() const { return max_num_data_files_; }
199  inline size_t getMaxMetaFiles() const { return max_num_meta_files_; }
200  inline size_t getMaxWrapperSize() const { return max_wrapper_space_; }
201  inline size_t getDataFileSize() const { return page_size_ * num_pages_per_data_file_; }
202  inline size_t getMetadataFileSize() const {
204  }
205 
206  size_t getNumDataFiles() const;
207  size_t getNumMetaFiles() const;
208  inline size_t getAvailableSpace() { return max_size_ - getAllocated(); }
209  inline size_t getAvailableWrapperSpace() {
211  }
212  inline size_t getAllocated() override {
213  return getFilesSize() + getTableFileMgrsSize();
214  }
215  size_t getMaxDataFilesSize() const;
216 
220  void removeChunkKeepMetadata(const ChunkKey& key);
221 
225  void clearForTable(int32_t db_id, int32_t tb_id);
226 
231  inline bool hasFileMgrKey() const override { return false; }
232 
236  void closeRemovePhysical() override;
237 
241  size_t getChunkSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
242  size_t getMetadataSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
243  size_t getTableFileMgrSpaceReserved(int32_t db_id, int32_t tb_id) const;
244  size_t getSpaceReservedByTable(int32_t db_id, int32_t tb_id) const;
245 
249  std::string describeSelf() const override;
250 
255  void checkpoint(const int32_t db_id, const int32_t tb_id) override;
256 
260  int32_t epoch(int32_t db_id, int32_t tb_id) const override;
261 
265  FileBuffer* putBuffer(const ChunkKey& key,
266  AbstractBuffer* srcBuffer,
267  const size_t numBytes = 0) override;
272  CachingFileBuffer* allocateBuffer(const size_t page_size,
273  const ChunkKey& key,
274  const size_t num_bytes = 0) override;
276  const ChunkKey& key,
277  const std::vector<HeaderInfo>::const_iterator& headerStartIt,
278  const std::vector<HeaderInfo>::const_iterator& headerEndIt) override;
279 
283  bool updatePageIfDeleted(FileInfo* file_info,
284  ChunkKey& chunk_key,
285  int32_t contingent,
286  int32_t page_epoch,
287  int32_t page_num) override;
288 
292  inline bool failOnReadError() const override { return false; }
293 
297  void deleteBufferIfExists(const ChunkKey& key);
298 
303  size_t getNumChunksWithMetadata() const;
304 
308  size_t getNumDataChunks() const;
309 
313  std::vector<ChunkKey> getChunkKeysForPrefix(const ChunkKey& prefix) const;
314 
318  std::unique_ptr<CachingFileMgr> reconstruct() const;
319 
323  void deleteWrapperFile(int32_t db, int32_t tb);
324 
328  void writeWrapperFile(const std::string& doc, int32_t db, int32_t tb);
329 
333  bool hasWrapperFile(int32_t db_id, int32_t table_id) const;
334 
335  std::string getTableFileMgrPath(int32_t db, int32_t tb) const;
336 
341  size_t getFilesSize() const;
342 
347  size_t getTableFileMgrsSize() const;
348 
352  std::optional<FileBuffer*> getBufferIfExists(const ChunkKey& key);
353 
358  void free_page(std::pair<FileInfo*, int32_t>&& page) override;
359 
361  const ChunkKey& keyPrefix) override;
362 
363  // Useful for debugging.
364  std::string dumpKeysWithMetadata() const;
365  std::string dumpKeysWithChunkData() const;
366  std::string dumpTableQueue() const { return table_evict_alg_.dumpEvictionQueue(); }
367  std::string dumpEvictionQueue() const { return chunk_evict_alg_.dumpEvictionQueue(); }
368  std::string dump() const;
369 
370  // Used for unit testing
371  void setMaxNumDataFiles(size_t max) { max_num_data_files_ = max; }
372  void setMaxNumMetadataFiles(size_t max) { max_num_meta_files_ = max; }
373  void setMaxWrapperSpace(size_t max) { max_wrapper_space_ = max; }
374  std::set<ChunkKey> getKeysWithMetadata() const;
375  void setDataSizeLimit(size_t max) { limit_data_size_ = max; }
376 
377  private:
381  void incrementEpoch(int32_t db_id, int32_t tb_id);
382 
387  void init(const size_t num_reader_threads);
388 
392  void writeAndSyncEpochToDisk(int32_t db_id, int32_t tb_id);
393 
398  void readTableFileMgrs();
399 
404  const ChunkKey& key,
405  const std::vector<HeaderInfo>::const_iterator& startIt,
406  const std::vector<HeaderInfo>::const_iterator& endIt) override;
407 
412  size_t pageSize = 0,
413  const size_t numBytes = 0) override;
414 
418  void createTableFileMgrIfNoneExists(const int32_t db_id, const int32_t tb_id);
419 
423  void incrementAllEpochs();
424 
428  void removeTableFileMgr(int32_t db_id, int32_t tb_id);
429 
433  void removeTableBuffers(int32_t db_id, int32_t tb_id);
434 
438  void writeDirtyBuffers(int32_t db_id, int32_t tb_id);
439 
444  Page requestFreePage(size_t pagesize, const bool isMetadata) override;
445 
449  void touchKey(const ChunkKey& key) const;
450  void removeKey(const ChunkKey& key) const;
451 
456  std::vector<ChunkKey> getKeysForTable(int32_t db_id, int32_t tb_id) const;
457 
464 
470  FileInfo* evictPages();
471 
477  void deleteCacheIfTooLarge();
478 
483  void setMaxSizes();
484 
486  const size_t numBytes = 0) const override;
487  ChunkKeyToChunkMap::iterator deleteBufferUnlocked(
488  const ChunkKeyToChunkMap::iterator chunk_it,
489  const bool purge = true) override;
490 
491  mutable heavyai::shared_mutex table_dirs_mutex_; // mutex for table_dirs_.
492  // Each table gets a separate epoch. Uses pointers for move semantics.
493  std::map<TablePair, std::unique_ptr<TableFileMgr>> table_dirs_;
494 
495  size_t max_num_data_files_; // set based on max_size_.
496  size_t max_num_meta_files_; // set based on max_size_.
497  size_t max_wrapper_space_; // set based on max_size_.
498  size_t max_size_;
499  std::optional<size_t> limit_data_size_{}; // Used for testing artifically small caches.
500 
501  mutable LRUEvictionAlgorithm chunk_evict_alg_; // last chunk touched.
502  mutable LRUEvictionAlgorithm table_evict_alg_; // last table touched.
503 };
504 
505 } // namespace File_Namespace
const size_t metadata_page_size_
Definition: FileMgr.h:536
size_t getTableFileMgrSpaceReserved(int32_t db_id, int32_t tb_id) const
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunkMetadataVec, const ChunkKey &keyPrefix) override
std::vector< int > ChunkKey
Definition: types.h:36
LRUEvictionAlgorithm table_evict_alg_
std::string dumpTableQueue() const
void removeDiskContent() const
Removes all disk data for the subdir.
std::string getStringMgrType() override
static constexpr size_t DEFAULT_NUM_PAGES_PER_METADATA_FILE
Definition: FileMgr.h:372
const size_t page_size_
Definition: FileMgr.h:535
static constexpr float METADATA_SPACE_PERCENTAGE
heavyai::shared_mutex table_dirs_mutex_
const std::string kDefaultDiskCacheDirName
std::string get_dir_name_for_table(int db_id, int tb_id)
void writeWrapperFile(const std::string &doc, int32_t db, int32_t tb)
Writes a wrapper file to a table subdir.
A logical page (Page) belongs to a file on disk.
Definition: Page.h:46
size_t getFilesSize() const
Get the total size of page files (data and metadata files). This includes allocated, but unused space.
std::vector< ChunkKey > getChunkKeysForPrefix(const ChunkKey &prefix) const
Returns the keys for chunks with chunk data that match the given prefix.
void setMaxSizes()
Sets the maximum number of files/space for each type of storage based on the maximum size...
void writeAndSyncEpochToDisk()
Write and flush the epoch to the epoch file on disk.
#define UNREACHABLE()
Definition: Logger.h:333
This file includes the class specification for the FILE manager (FileMgr), and related data structure...
std::string describeSelf() const override
describes this FileMgr for logging purposes.
size_t getSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
void closeRemovePhysical() override
Closes files and removes the caching directory.
void touchKey(const ChunkKey &key) const
Used to track which tables/chunks were least recently used.
#define DEFAULT_METADATA_PAGE_SIZE
size_t getMetadataSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
void createTableFileMgrIfNoneExists(const int32_t db_id, const int32_t tb_id)
Create and initialize a subdirectory for a table if none exists.
Represents/provides access to contiguous data stored in the file system.
Definition: FileBuffer.h:57
void checkpoint() override
Fsyncs data files, writes out epoch and fsyncs that.
Definition: FileMgr.cpp:703
Page requestFreePage(size_t pagesize, const bool isMetadata) override
requests a free page similar to FileMgr, but this override will also evict existing pages to make spa...
void deleteWrapperFile(int32_t db, int32_t tb)
Deletes the wrapper file from a table subdir.
ChunkKeyToChunkMap::iterator deleteBufferUnlocked(const ChunkKeyToChunkMap::iterator chunk_it, const bool purge=true) override
CachingFileMgr(const DiskCacheConfig &config)
static size_t num_pages_per_data_file_
Definition: FileMgr.h:417
std::optional< FileBuffer * > getBufferIfExists(const ChunkKey &key)
an optional version of get buffer if we are not sure a chunk exists.
std::set< ChunkKey > getKeysWithMetadata() const
bool failOnReadError() const override
True if a read error should cause a fatal error.
FileInfo * evictPages()
evicts all data pages for the least recently used Chunk (metadata pages persist). Returns the first F...
FileBuffer * createBufferUnlocked(const ChunkKey &key, size_t pageSize=0, const size_t numBytes=0) override
Creates a buffer.
void deleteCacheIfTooLarge()
When the cache is read from disk, we don't know which chunks were least recently used. Rather than try to evict random pages to get down to size, we just reset the cache to make sure we have space.
heavyai::shared_mutex table_mutex_
void incrementAllEpochs()
Increment epochs for each table in the CFM.
CachingFileBuffer * allocateBuffer(const size_t page_size, const ChunkKey &key, const size_t num_bytes=0) override
allocates a new CachingFileBuffer and tracks its use in the eviction algorithms. ...
int32_t incrementEpoch()
Definition: FileMgr.h:281
size_t getNumDataChunks() const
Returns the number of buffers with chunk data in the CFM.
bool hasWrapperFile(int32_t db_id, int32_t table_id) const
std::unique_ptr< CachingFileMgr > reconstruct() const
Initializes a new CFM using the initialization values in the current CFM.
size_t getTableFileMgrsSize() const
Returns the total size of all subdirectory files. Each table represented in the CFM has a subdirector...
void removeTableBuffers(int32_t db_id, int32_t tb_id)
Erases and cleans up all buffers for a table.
TableFileMgr(const std::string &table_path)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
static constexpr size_t DEFAULT_MAX_SIZE
bool updatePageIfDeleted(FileInfo *file_info, ChunkKey &chunk_key, int32_t contingent, int32_t page_epoch, int32_t page_num) override
checks whether a page should be deleted.
void deleteWrapperFile() const
Deletes only the wrapper file on disk.
An AbstractBuffer is a unit of data management for a data manager.
void incrementEpoch()
increment the epoch for this subdir (not synced to disk).
void setMaxNumDataFiles(size_t max)
void deleteBufferIfExists(const ChunkKey &key)
deletes a buffer if it exists in the mgr. Otherwise do nothing.
static size_t num_pages_per_metadata_file_
Definition: FileMgr.h:418
void writeAndSyncEpochToDisk()
Definition: FileMgr.cpp:656
void removeTableFileMgr(int32_t db_id, int32_t tb_id)
Removes the subdirectory content for a table.
std::optional< size_t > limit_data_size_
#define DEFAULT_PAGE_SIZE
void setMaxNumMetadataFiles(size_t max)
void removeChunkKeepMetadata(const ChunkKey &key)
Free pages for chunk and remove it from the chunk eviction algorithm.
void writeWrapperFile(const std::string &doc) const
Writes wrapper file to disk.
bool hasFileMgrKey() const override
Query to determine if the contained pages will have their database and table ids overridden by the fil...
std::string dumpKeysWithChunkData() const
Definition: Epoch.h:30
void write(int8_t *src, const size_t numBytes, const size_t offset=0, const MemoryLevel srcMemoryLevel=CPU_LEVEL, const int32_t deviceId=-1) override
static constexpr char WRAPPER_FILE_NAME[]
size_t getReservedSpace() const
Returns the disk space used (in bytes) for the subdir.
std::map< TablePair, std::unique_ptr< TableFileMgr > > table_dirs_
std::string dumpKeysWithMetadata() const
std::vector< ChunkKey > getKeysForTable(int32_t db_id, int32_t tb_id) const
returns set of keys contained in chunkIndex_ that match the given table prefix.
void readTableFileMgrs()
Checks for any sub-directories containing table-specific data and creates epochs from found files...
FileInfo * evictMetadataPages()
evicts all metadata pages for the least recently used table. Returns the first FileInfo that a page w...
FileBuffer(FileMgr *fm, const size_t pageSize, const ChunkKey &chunkKey, const size_t initialSize=0)
Constructs a FileBuffer object.
Definition: FileBuffer.cpp:38
int32_t epoch() const
Definition: FileMgr.h:517
void close(FILE *f)
Closes the file pointed to by the FILE pointer.
Definition: File.cpp:128
void init(const size_t num_reader_threads)
Initializes a CFM, parsing any existing files and initializing data structures appropriately (current...
static std::string getDefaultPath(const std::string &base_path)
std::shared_timed_mutex shared_mutex
void free_page(std::pair< FileInfo *, int32_t > &&page) override
Unlike the FileMgr, the CFM frees pages immediately instead of holding them until the next checkpoint...
size_t getNumChunksWithMetadata() const
Returns the number of buffers with metadata in the CFM. Any buffer with an encoder counts...
size_t getChunkSpaceReservedByTable(int32_t db_id, int32_t tb_id) const
FileBuffer * getBufferUnlocked(const ChunkKey &key, const size_t numBytes=0) const override
static constexpr float METADATA_FILE_SPACE_PERCENTAGE
int32_t getEpoch() const
Returns the current epoch (locked)
std::string getTableFileMgrPath(int32_t db, int32_t tb) const
A selection of helper methods for File I/O.
void clearForTable(int32_t db_id, int32_t tb_id)
Removes all data related to the given table (pages and subdirectories).
void removeKey(const ChunkKey &key) const
This file includes the class specification for the Least Recently Used cache eviction algorithm used ...
A FileMgr capable of limiting its size and storing data from multiple tables in a shared directory...
void setMaxWrapperSpace(size_t max)
LRUEvictionAlgorithm chunk_evict_alg_
std::string levelAsString() const
FileBuffer * putBuffer(const ChunkKey &key, AbstractBuffer *srcBuffer, const size_t numBytes=0) override
deletes any existing buffer for the given key then copies in a new one.
std::string dumpEvictionQueue() const
FileBuffer * createBufferFromHeaders(const ChunkKey &key, const std::vector< HeaderInfo >::const_iterator &startIt, const std::vector< HeaderInfo >::const_iterator &endIt) override
Creates a buffer and initializes it with info read from files on disk.