OmniSciDB  a987f07e93
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FileMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #pragma once
26 
27 #include <future>
28 #include <iostream>
29 #include <map>
30 #include <mutex>
31 #include <set>
32 #include <vector>
33 
34 #include "DataMgr/AbstractBuffer.h"
36 #include "DataMgr/FileMgr/Epoch.h"
39 #include "DataMgr/FileMgr/Page.h"
42 
43 using namespace Data_Namespace;
44 
// Forward declaration so that declarations in this header can name
// boost::filesystem::directory_iterator by reference.
namespace boost::filesystem {
class directory_iterator;
}  // namespace boost::filesystem
50 
51 namespace File_Namespace {
52 class GlobalFileMgr; // forward declaration
61 using PageSizeFileMMap = std::multimap<size_t, int32_t>;
62 
73 using Chunk = FileBuffer;
74 
85 using ChunkKeyToChunkMap = std::map<ChunkKey, FileBuffer*>;
91 using TablePair = std::pair<const int32_t, const int32_t>;
92 
/// Metadata about a single file, as returned by FileMgr::getMetadataForFile().
///
/// Every field carries a default so that a default-constructed FileMetadata
/// never exposes indeterminate values; the defaults match what value
/// initialization (`FileMetadata{}`) already produced.
struct FileMetadata {
  int32_t file_id{0};
  std::string file_path{};
  size_t page_size{0};
  size_t file_size{0};
  size_t num_pages{0};
  // NOTE(review): the listing this was taken from skips one source line right
  // before the closing brace - confirm that no field was lost there.
};
101 
/// Storage statistics, as returned by FileMgr::getStorageStats().
///
/// Counters start at zero; the std::optional members start empty.
/// The destructor is virtual, so all five copy/move special members are
/// declared explicitly (defaulted) instead of relying on the deprecated
/// implicit copy assignment and a silently suppressed move.
struct StorageStats {
  int32_t epoch{0};
  int32_t epoch_floor{0};
  uint32_t metadata_file_count{0};
  uint64_t total_metadata_file_size{0};
  uint64_t total_metadata_page_count{0};
  std::optional<uint64_t> total_free_metadata_page_count{};
  uint32_t data_file_count{0};
  uint64_t total_data_file_size{0};
  uint64_t total_data_page_count{0};
  std::optional<uint64_t> total_free_data_page_count{};
  std::optional<uint32_t> fragment_count{};

  StorageStats() = default;
  StorageStats(const StorageStats& storage_stats) = default;
  StorageStats& operator=(const StorageStats& storage_stats) = default;
  StorageStats(StorageStats&& storage_stats) = default;
  StorageStats& operator=(StorageStats&& storage_stats) = default;
  virtual ~StorageStats() = default;
};
119 
121  std::vector<HeaderInfo> header_infos;
122  int32_t max_file_id;
124 };
125 
// Page header size is serialized/deserialized as an int.
using PageHeaderSizeType = int32_t;

/// Records where a page came from (source file/page/header size) and where it
/// went (destination file/page).
struct PageMapping {
  // NOTE(review): one source line at this position is absent from the extracted
  // listing. A defaulted constructor is supplied so that default construction
  // (arrays, vector::resize) works - confirm against the original header.
  PageMapping() = default;

  PageMapping(int32_t source_file_id,
              size_t source_page_num,
              PageHeaderSizeType source_page_header_size,
              int32_t destination_file_id,
              size_t destination_page_num)
      : source_file_id(source_file_id)
      , source_page_num(source_page_num)
      , source_page_header_size(source_page_header_size)
      , destination_file_id(destination_file_id)
      , destination_page_num(destination_page_num) {}

  // The four members after source_file_id were missing from the listing although
  // the constructor initializes them; their types are the constructor parameter
  // types and their order follows the initializer list.
  // NOTE(review): if this struct is written to disk as raw bytes the member
  // order matters - verify it against the original header.
  int32_t source_file_id{0};
  size_t source_page_num{0};
  PageHeaderSizeType source_page_header_size{0};
  int32_t destination_file_id{0};
  size_t destination_page_num{0};
};
149 
154 class FileMgr : public AbstractBufferMgr { // implements
155  friend class GlobalFileMgr;
156 
157  public:
159  FileMgr(const int32_t device_id,
160  GlobalFileMgr* gfm,
161  const TablePair file_mgr_key,
162  const int32_t max_rollback_epochs = -1,
163  const size_t num_reader_threads = 0,
164  const int32_t epoch = -1);
165 
166  // used only to initialize enough to drop or to get basic metadata
167  FileMgr(const int32_t device_id,
168  GlobalFileMgr* gfm,
169  const TablePair file_mgr_key,
170  const bool run_core_init);
171 
172  FileMgr(GlobalFileMgr* gfm, std::string basePath);
173 
175  ~FileMgr() override;
176 
177  StorageStats getStorageStats() const;
179  FileBuffer* createBuffer(const ChunkKey& key,
180  size_t pageSize = 0,
181  const size_t numBytes = 0) override;
182 
183  bool isBufferOnDevice(const ChunkKey& key) override;
185  // Purge == true means delete the data chunks -
186  // can't undelete and revert to previous
187  // state - reclaims disk space for chunk
188  void deleteBuffer(const ChunkKey& key, const bool purge = true) override;
189  void deleteBuffersWithPrefix(const ChunkKey& keyPrefix,
190  const bool purge = true) override;
191 
193  FileBuffer* getBuffer(const ChunkKey& key, const size_t numBytes = 0) override;
194 
195  void fetchBuffer(const ChunkKey& key,
196  AbstractBuffer* destBuffer,
197  const size_t numBytes) override;
198 
205  FileBuffer* putBuffer(const ChunkKey& key,
206  AbstractBuffer* d,
207  const size_t numBytes = 0) override;
208 
209  // Buffer API
210  AbstractBuffer* alloc(const size_t numBytes) override;
211  void free(AbstractBuffer* buffer) override;
212  virtual Page requestFreePage(size_t pagesize, const bool isMetadata);
213 
214  inline MgrType getMgrType() override { return FILE_MGR; };
215  inline std::string getStringMgrType() override { return ToString(FILE_MGR); }
216  inline std::string printSlabs() override { return "Not Implemented"; }
217  inline size_t getMaxSize() override { return 0; }
218  inline size_t getInUseSize() override { return 0; }
219  inline size_t getAllocated() override { return 0; }
220  inline bool isAllocationCapped() override { return false; }
221 
222  inline FileInfo* getFileInfoForFileId(const int32_t fileId) const {
223  return files_.at(fileId);
224  }
225 
226  FileMetadata getMetadataForFile(
227  const boost::filesystem::directory_iterator& fileIterator) const;
228 
229  void init(const size_t num_reader_threads, const int32_t epochOverride);
230  void init(const std::string& dataPathToConvertFrom, const int32_t epochOverride);
231 
232  void copyPage(Page& srcPage,
233  FileMgr* destFileMgr,
234  Page& destPage,
235  const size_t reservedHeaderSize,
236  const size_t numBytes,
237  const size_t offset);
238 
252  void requestFreePages(size_t npages,
253  size_t pagesize,
254  std::vector<Page>& pages,
255  const bool isMetadata);
256 
257  void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector& chunkMetadataVec,
258  const ChunkKey& keyPrefix) override;
259 
260  bool hasChunkMetadataForKeyPrefix(const ChunkKey& keyPrefix);
261 
267  void checkpoint() override;
268  void checkpoint(const int32_t db_id, const int32_t tb_id) override {
269  LOG(FATAL) << "Operation not supported, api checkpoint() should be used instead";
270  }
277  inline virtual int32_t epoch(int32_t db_id, int32_t tb_id) const { return epoch(); }
278 
279  inline int32_t epochFloor() const { return static_cast<int32_t>(epoch_.floor()); }
280 
281  inline int32_t incrementEpoch() {
282  int32_t newEpoch = epoch_.increment();
283  epochIsCheckpointed_ = false;
284  // We test for error here instead of in Epoch::increment so we can log FileMgr
285  // metadata
286  if (newEpoch > Epoch::max_allowable_epoch()) {
287  LOG(FATAL) << "Epoch for table (" << fileMgrKey_.first << ", " << fileMgrKey_.second
288  << ") greater than maximum allowed value of "
289  << Epoch::max_allowable_epoch() << ".";
290  }
291  return newEpoch;
292  }
293 
297  inline int32_t lastCheckpointedEpoch() const {
298  return epoch() - (epochIsCheckpointed_ ? 0 : 1);
299  }
300 
301  inline void resetEpochFloor() { epoch_.floor(epoch_.ceiling()); }
302 
306  inline int32_t maxRollbackEpochs() { return maxRollbackEpochs_; }
307 
312  inline size_t getNumReaderThreads() { return num_reader_threads_; }
313 
321  FILE* getFileForFileId(const int32_t fileId);
322 
323  size_t getNumChunks() override;
324  size_t getNumUsedMetadataPagesForChunkKey(const ChunkKey& chunkKey) const;
325 
327  // #TM Not sure if we need this below
328  int32_t getDBVersion() const;
329  bool getDBConvert() const;
330  void createTopLevelMetadata(); // create metadata shared by all tables of all DBs
331  inline std::string getFileMgrBasePath() const { return fileMgrBasePath_; }
332  virtual void closeRemovePhysical();
333 
334  void removeTableRelatedDS(const int32_t db_id, const int32_t table_id) override;
335 
336  virtual void free_page(std::pair<FileInfo*, int32_t>&& page);
337  inline virtual bool hasFileMgrKey() const { return true; }
338  const TablePair get_fileMgrKey() const { return fileMgrKey_; }
339 
340  boost::filesystem::path getFilePath(const std::string& file_name) const;
341 
342  // Visible for use in unit tests.
343  void writePageMappingsToStatusFile(const std::vector<PageMapping>& page_mappings);
344 
345  // Visible for use in unit tests.
346  void renameCompactionStatusFile(const char* const from_status,
347  const char* const to_status);
348 
349  void compactFiles();
350 
354  virtual bool updatePageIfDeleted(FileInfo* file_info,
355  ChunkKey& chunk_key,
356  int32_t contingent,
357  int32_t page_epoch,
358  int32_t page_num);
359 
363  inline virtual bool failOnReadError() const { return true; }
364 
365  inline size_t getPageSize() const { return page_size_; }
366  inline size_t getMetadataPageSize() const { return metadata_page_size_; }
367 
368  // Used to describe the manager in logging and error messages.
369  virtual std::string describeSelf() const;
370 
371  static constexpr size_t DEFAULT_NUM_PAGES_PER_DATA_FILE{256};
372  static constexpr size_t DEFAULT_NUM_PAGES_PER_METADATA_FILE{4096};
373 
374  // Name of files that indicate the different statuses/phases of data compaction.
375  static constexpr char const* COPY_PAGES_STATUS{"pending_data_compaction_0"};
376  static constexpr char const* UPDATE_PAGE_VISIBILITY_STATUS{"pending_data_compaction_1"};
377  static constexpr char const* DELETE_EMPTY_FILES_STATUS{"pending_data_compaction_2"};
378 
379  // Methods that enable override of number of pages per data/metadata file
380  // for use in unit tests.
381  static void setNumPagesPerDataFile(size_t num_pages);
382  static void setNumPagesPerMetadataFile(size_t num_pages);
383 
384  static void renameAndSymlinkLegacyFiles(const std::string& table_data_dir);
385 
386  static constexpr char LEGACY_EPOCH_FILENAME[] = "epoch";
387  static constexpr char EPOCH_FILENAME[] = "epoch_metadata";
388  static constexpr char DB_META_FILENAME[] = "dbmeta";
389  static constexpr char FILE_MGR_VERSION_FILENAME[] = "filemgr_version";
390  static constexpr int32_t INVALID_VERSION = -1;
391 
392  protected:
393  // Used to initialize CachingFileMgr.
394  FileMgr(const size_t defaultPageSize, const size_t defaultMetadataPageSize);
395 
397  std::string fileMgrBasePath_;
398  std::map<int32_t, FileInfo*>
403  unsigned nextFileId_;
404  int32_t db_version_;
405  int32_t fileMgrVersion_;
407  const int32_t latestFileMgrVersion_{2};
408  FILE* DBMetaFile_ = nullptr;
409  std::mutex getPageMutex_;
412 
414  std::vector<std::pair<FileInfo*, int32_t>> free_pages_;
415  bool isFullyInitted_{false};
416 
419 
436  FileInfo* createFile(const size_t pageSize, const size_t numPages);
437  FileInfo* openExistingFile(const std::string& path,
438  const int32_t fileId,
439  const size_t pageSize,
440  const size_t numPages,
441  std::vector<HeaderInfo>& headerVec);
442  void createEpochFile(const std::string& epochFileName);
443  int32_t openAndReadLegacyEpochFile(const std::string& epochFileName);
444  void openAndReadEpochFile(const std::string& epochFileName);
445  void writeAndSyncEpochToDisk();
446  void setEpoch(const int32_t newEpoch); // resets current value of epoch at startup
447  int32_t readVersionFromDisk(const std::string& versionFileName) const;
448  void writeAndSyncVersionToDisk(const std::string& versionFileName,
449  const int32_t version);
450  void processFileFutures(std::vector<std::future<std::vector<HeaderInfo>>>& file_futures,
451  std::vector<HeaderInfo>& headerVec);
452  virtual FileBuffer* createBufferUnlocked(const ChunkKey& key,
453  size_t pageSize = 0,
454  const size_t numBytes = 0);
455  virtual FileBuffer* createBufferFromHeaders(
456  const ChunkKey& key,
457  const std::vector<HeaderInfo>::const_iterator& headerStartIt,
458  const std::vector<HeaderInfo>::const_iterator& headerEndIt);
459 
460  // Migration functions
461  void migrateToLatestFileMgrVersion();
462  void migrateEpochFileV0();
463  void migrateLegacyFilesV1();
464 
465  OpenFilesResult openFiles();
466 
467  void clearFileInfos();
468 
469  // Data compaction methods
470  void copySourcePageForCompaction(const Page& source_page,
471  FileInfo* destination_file_info,
472  std::vector<PageMapping>& page_mappings,
473  std::set<Page>& touched_pages);
474  int32_t copyPageWithoutHeaderSize(const Page& source_page,
475  const Page& destination_page);
476  void sortAndCopyFilePagesForCompaction(size_t page_size,
477  std::vector<PageMapping>& page_mappings,
478  std::set<Page>& touched_pages);
479  void updateMappedPagesVisibility(const std::vector<PageMapping>& page_mappings);
480  void deleteEmptyFiles();
481  void resumeFileCompaction(const std::string& status_file_name);
482  std::vector<PageMapping> readPageMappingsFromStatusFile();
483 
484  // For testing purposes only
485  FileMgr(const int epoch);
486 
487  void closePhysicalUnlocked();
488  void syncFilesToDisk();
489  void freePages();
490  void initializeNumThreads(size_t num_reader_threads = 0);
491  virtual FileBuffer* allocateBuffer(const size_t page_size,
492  const ChunkKey& key,
493  const size_t num_bytes = 0);
494  virtual FileBuffer* allocateBuffer(
495  const ChunkKey& key,
496  const std::vector<HeaderInfo>::const_iterator& headerStartIt,
497  const std::vector<HeaderInfo>::const_iterator& headerEndIt);
498  virtual ChunkKeyToChunkMap::iterator deleteBufferUnlocked(
499  const ChunkKeyToChunkMap::iterator chunk_it,
500  const bool purge = true);
501  virtual FileBuffer* getBufferUnlocked(const ChunkKey& key,
502  const size_t numBytes = 0) const;
503 
504  private:
505  void rollOffOldData(const int32_t epochCeiling, const bool shouldCheckpoint);
506  void freePagesBeforeEpoch(const int32_t min_epoch);
507  void freePagesBeforeEpochUnlocked(const int32_t min_epoch,
508  const ChunkKeyToChunkMap::iterator lower_bound,
509  const ChunkKeyToChunkMap::iterator upper_bound);
510  FileBuffer* getOrCreateBuffer(const ChunkKey& key);
516  bool coreInit();
517  inline int32_t epoch() const { return static_cast<int32_t>(epoch_.ceiling()); }
518  void writeDirtyBuffers();
519 
520  void setDataAndMetadataFileStats(StorageStats& storage_stats) const;
521  uint32_t getFragmentCount() const;
522 
525 
527  bool epochIsCheckpointed_ = true;
528  FILE* epochFile_ = nullptr;
529 
530  protected:
531  // gfm_ needs to be defined before the page_size_ because it may be used to
532  // initialize it. However, we also want it to remain private, as the CachingFileMgr
533  // should not be allowed to access it (it will be null). page_size_ should be
534  // be protected.
535  const size_t page_size_;
536  const size_t metadata_page_size_;
537 };
538 
539 } // namespace File_Namespace
DEVICE auto upper_bound(ARGS &&...args)
Definition: gpu_enabled.h:123
const size_t metadata_page_size_
Definition: FileMgr.h:536
virtual int32_t epoch(int32_t db_id, int32_t tb_id) const
Returns current value of epoch - should be one greater than recorded at last checkpoint. Because FileMgr only contains buffers from one table we can just return the FileMgr's epoch instead of finding a table-specific epoch.
Definition: FileMgr.h:277
std::vector< int > ChunkKey
Definition: types.h:36
TablePair fileMgrKey_
Global FileMgr.
Definition: FileMgr.h:524
size_t getPageSize() const
Definition: FileMgr.h:365
virtual bool hasFileMgrKey() const
Definition: FileMgr.h:337
const size_t page_size_
Definition: FileMgr.h:535
std::vector< HeaderInfo > header_infos
Definition: FileMgr.h:121
A logical page (Page) belongs to a file on disk.
Definition: Page.h:46
#define LOG(tag)
Definition: Logger.h:283
std::string printSlabs() override
Definition: FileMgr.h:216
size_t getMaxSize() override
Definition: FileMgr.h:217
std::string getFileMgrBasePath() const
Definition: FileMgr.h:331
GlobalFileMgr * gfm_
Definition: FileMgr.h:523
std::mutex getPageMutex_
pointer to DB level metadata
Definition: FileMgr.h:409
Represents/provides access to contiguous data stored in the file system.
Definition: FileBuffer.h:57
static int64_t max_allowable_epoch()
Definition: Epoch.h:69
MgrType getMgrType() override
Definition: FileMgr.h:214
std::string fileMgrBasePath_
Definition: FileMgr.h:397
std::multimap< size_t, int32_t > PageSizeFileMMap
Maps logical page sizes to files.
Definition: FileMgr.h:61
int32_t lastCheckpointedEpoch() const
Returns value of epoch at last checkpoint.
Definition: FileMgr.h:297
size_t getInUseSize() override
Definition: FileMgr.h:218
static size_t num_pages_per_data_file_
Definition: FileMgr.h:417
This file includes the class specification for the FILE manager (FileMgr), and related data structure...
int32_t PageHeaderSizeType
Definition: FileMgr.h:127
int32_t db_version_
the index of the next file id
Definition: FileMgr.h:404
PageMapping(int32_t source_file_id, size_t source_page_num, PageHeaderSizeType source_page_header_size, int32_t destination_file_id, size_t destination_page_num)
Definition: FileMgr.h:132
int32_t incrementEpoch()
Definition: FileMgr.h:281
void init(LogOptions const &log_opts)
Definition: Logger.cpp:360
string version
Definition: setup.in.py:73
ChunkKeyToChunkMap chunkIndex_
Definition: FileMgr.h:326
PageSizeFileMMap fileIndex_
A map of files accessible via a file identifier.
Definition: FileMgr.h:401
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
std::vector< std::pair< FileInfo *, int32_t > > free_pages_
Definition: FileMgr.h:414
An AbstractBuffer is a unit of data management for a data manager.
size_t num_reader_threads_
Maps page sizes to FileInfo objects.
Definition: FileMgr.h:402
static size_t num_pages_per_metadata_file_
Definition: FileMgr.h:418
std::map< ChunkKey, FileBuffer * > ChunkKeyToChunkMap
Maps ChunkKeys (unique ids for Chunks) to Chunk objects.
Definition: FileMgr.h:85
num_reader_threads_(num_reader_threads)
std::string compaction_status_file_name
Definition: FileMgr.h:123
heavyai::shared_mutex chunkIndexMutex_
Definition: FileMgr.h:410
const TablePair get_fileMgrKey() const
Definition: FileMgr.h:338
heavyai::shared_mutex mutex_free_page_
Definition: FileMgr.h:413
int32_t maxRollbackEpochs_
Definition: FileMgr.h:396
DEVICE auto lower_bound(ARGS &&...args)
Definition: gpu_enabled.h:78
Definition: Epoch.h:30
void checkpoint(const int32_t db_id, const int32_t tb_id) override
Definition: FileMgr.h:268
virtual bool failOnReadError() const
True if a read error should cause a fatal error.
Definition: FileMgr.h:363
std::string getStringMgrType() override
Definition: FileMgr.h:215
size_t getAllocated() override
Definition: FileMgr.h:219
This file contains the declaration and definition of a Page type and a MultiPage type.
int32_t epoch() const
Definition: FileMgr.h:517
std::map< int32_t, FileInfo * > files_
Definition: FileMgr.h:400
FileInfo * getFileInfoForFileId(const int32_t fileId) const
Definition: FileMgr.h:222
heavyai::shared_mutex files_rw_mutex_
Definition: FileMgr.h:411
int32_t maxRollbackEpochs()
Returns value max_rollback_epochs.
Definition: FileMgr.h:306
std::shared_timed_mutex shared_mutex
std::pair< const int32_t, const int32_t > TablePair
Definition: FileMgr.h:91
unsigned nextFileId_
number of threads used when loading data
Definition: FileMgr.h:403
int32_t epochFloor() const
Definition: FileMgr.h:279
PageHeaderSizeType source_page_header_size
Definition: FileMgr.h:145
bool isAllocationCapped() override
Definition: FileMgr.h:220
size_t getMetadataPageSize() const
Definition: FileMgr.h:366
size_t getNumReaderThreads()
Returns number of threads defined by parameter num-reader-threads which should be used during initial...
Definition: FileMgr.h:312