OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FileMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #pragma once
26 
27 #include <future>
28 #include <iostream>
29 #include <map>
30 #include <mutex>
31 #include <set>
32 #include <vector>
33 
34 #include "DataMgr/AbstractBuffer.h"
36 #include "DataMgr/FileMgr/Epoch.h"
39 #include "DataMgr/FileMgr/Page.h"
42 
43 using namespace Data_Namespace;
44 
45 namespace boost {
46 namespace filesystem {
47 class directory_iterator;
48 }
49 } // namespace boost
50 
51 namespace File_Namespace {
// DB version for DataMgr DS and corresponding file buffer read/write code.
53 /* In future kDbVersion may be added to AbstractBufferMgr class.
54  * This will allow support of different dbVersions for different tables, so
55  * original tables can be generated by different versions of mapd software.
56  */
57 constexpr int32_t kDbVersion{2};
58 
59 class GlobalFileMgr; // forward declaration
// Maps logical page sizes to files (multimap: several int32_t values may share one
// page size).
68 using PageSizeFileMMap = std::multimap<size_t, int32_t>;
69 
// Chunk is simply another name for FileBuffer.
80 using Chunk = FileBuffer;
81 
// Maps ChunkKeys (unique ids for Chunks) to Chunk objects, stored as raw FileBuffer
// pointers.
92 using ChunkKeyToChunkMap = std::map<ChunkKey, FileBuffer*>;
// Key type for a FileMgr (see the file_mgr_key constructor parameters of FileMgr).
// NOTE(review): presumably (database id, table id) -- confirm against callers.
98 using TablePair = std::pair<const int32_t, const int32_t>;
99 
// Plain aggregate describing one file; it is the return type of
// FileMgr::getMetadataForFile(). All fields are left uninitialized by default.
// NOTE(review): the listing skips source line 106, so a further member may exist
// between num_pages and the closing brace.
100 struct FileMetadata {
101  int32_t file_id;
102  std::string file_path;
103  size_t page_size;
104  size_t file_size;
105  size_t num_pages;
107 };
108 
// Storage statistics record; it is the return type of FileMgr::getStorageStats().
// Every counter starts at zero and every std::optional field starts empty.
// The destructor is virtual, so the struct may be used as a base class.
109 struct StorageStats {
110  int32_t epoch{0};
111  int32_t epoch_floor{0};
112  uint32_t metadata_file_count{0};
113  uint64_t total_metadata_file_size{0};
114  uint64_t total_metadata_page_count{0};
     // Empty until a value is assigned.
115  std::optional<uint64_t> total_free_metadata_page_count{};
116  uint32_t data_file_count{0};
117  uint64_t total_data_file_size{0};
118  uint64_t total_data_page_count{0};
     // Empty until a value is assigned.
119  std::optional<uint64_t> total_free_data_page_count{};
     // Empty until a value is assigned.
120  std::optional<uint32_t> fragment_count{};
121 
122  StorageStats() = default;
123  StorageStats(const StorageStats& storage_stats) = default;
124  virtual ~StorageStats() = default;
125 };
126 
128  std::vector<HeaderInfo> header_infos;
129  int32_t max_file_id;
131 };
132 
133 // Page header size is serialized/deserialized as an int.
134 using PageHeaderSizeType = int32_t;
135 
// Associates a source page (file id, page number, page header size) with a
// destination page (file id, page number). Vectors of PageMapping are passed to
// the data compaction methods of FileMgr.
// NOTE(review): the listing skips source lines 137 and 151-154; the members other
// than source_file_id that the constructor initializes are declared there.
136 struct PageMapping {
138 
     // Stores each argument in the member of the same name.
139  PageMapping(int32_t source_file_id,
140  size_t source_page_num,
141  PageHeaderSizeType source_page_header_size,
142  int32_t destination_file_id,
143  size_t destination_page_num)
144  : source_file_id(source_file_id)
145  , source_page_num(source_page_num)
146  , source_page_header_size(source_page_header_size)
147  , destination_file_id(destination_file_id)
148  , destination_page_num(destination_page_num) {}
149 
150  int32_t source_file_id;
155 };
156 
// FileMgr: the FILE manager class. It derives from AbstractBufferMgr and overrides
// its buffer API (createBuffer, getBuffer, putBuffer, checkpoint, ...). Each instance
// carries its own epoch state (epoch_, epochIsCheckpointed_) and a TablePair key
// (fileMgrKey_) that is printed in log messages.
// NOTE(review): this listing omits some source lines, so members such as gfm_,
// fileMgrKey_, epoch_, maxRollbackEpochs_ and num_reader_threads_ are used below but
// declared on lines that are not shown.
161 class FileMgr : public AbstractBufferMgr { // implements
162  friend class GlobalFileMgr;
163 
164  public:
166  FileMgr(const int32_t device_id,
167  GlobalFileMgr* gfm,
168  const TablePair file_mgr_key,
169  const int32_t max_rollback_epochs = -1,
170  const size_t num_reader_threads = 0,
171  const int32_t epoch = -1);
172 
173  // used only to initialize enough to drop or to get basic metadata
174  FileMgr(const int32_t device_id,
175  GlobalFileMgr* gfm,
176  const TablePair file_mgr_key,
177  const bool run_core_init);
178 
179  FileMgr(GlobalFileMgr* gfm, std::string basePath);
180 
182  ~FileMgr() override;
183 
184  StorageStats getStorageStats() const;
186  FileBuffer* createBuffer(const ChunkKey& key,
187  size_t pageSize = 0,
188  const size_t numBytes = 0) override;
189 
190  bool isBufferOnDevice(const ChunkKey& key) override;
192  // Purge == true means delete the data chunks -
193  // can't undelete and revert to previous
194  // state - reclaims disk space for chunk
195  void deleteBuffer(const ChunkKey& key, const bool purge = true) override;
196  void deleteBuffersWithPrefix(const ChunkKey& keyPrefix,
197  const bool purge = true) override;
198 
200  FileBuffer* getBuffer(const ChunkKey& key, const size_t numBytes = 0) override;
201 
202  void fetchBuffer(const ChunkKey& key,
203  AbstractBuffer* destBuffer,
204  const size_t numBytes) override;
205 
212  FileBuffer* putBuffer(const ChunkKey& key,
213  AbstractBuffer* d,
214  const size_t numBytes = 0) override;
215 
216  // Buffer API
217  AbstractBuffer* alloc(const size_t numBytes) override;
218  void free(AbstractBuffer* buffer) override;
219  virtual Page requestFreePage(size_t pagesize, const bool isMetadata);
220 
     // Fixed answers for the AbstractBufferMgr queries below: the manager type is
     // FILE_MGR, slab printing is reported as "Not Implemented", the size queries
     // return 0 and the allocation is reported as not capped.
221  inline MgrType getMgrType() override { return FILE_MGR; };
222  inline std::string getStringMgrType() override { return ToString(FILE_MGR); }
223  inline std::string printSlabs() override { return "Not Implemented"; }
224  inline size_t getMaxSize() override { return 0; }
225  inline size_t getInUseSize() override { return 0; }
226  inline size_t getAllocated() override { return 0; }
227  inline bool isAllocationCapped() override { return false; }
228 
     // Returns the non-owning FileInfo pointer stored in files_ for fileId.
     // files_ is a std::map, so at() throws std::out_of_range for an unknown id.
229  inline FileInfo* getFileInfoForFileId(const int32_t fileId) const {
230  return files_.at(fileId).get();
231  }
232 
233  FileMetadata getMetadataForFile(
234  const boost::filesystem::directory_iterator& fileIterator) const;
235 
236  void copyPage(Page& srcPage,
237  FileMgr* destFileMgr,
238  Page& destPage,
239  const size_t reservedHeaderSize,
240  const size_t numBytes,
241  const size_t offset);
242 
256  void requestFreePages(size_t npages,
257  size_t pagesize,
258  std::vector<Page>& pages,
259  const bool isMetadata);
260 
261  void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector& chunkMetadataVec,
262  const ChunkKey& keyPrefix) override;
263 
264  bool hasChunkMetadataForKeyPrefix(const ChunkKey& keyPrefix);
265 
271  void checkpoint() override;
     // The per-table overload is not supported on a FileMgr: calling it logs a
     // FATAL message that points the caller at the no-argument checkpoint().
272  void checkpoint(const int32_t db_id, const int32_t tb_id) override {
273  LOG(FATAL) << "Operation not supported, api checkpoint() should be used instead";
274  }
     // Returns current value of epoch - should be one greater than recorded at last
     // checkpoint. Because FileMgr only contains buffers from one table we can just
     // return the FileMgr's epoch instead of finding a table-specific epoch; both
     // arguments are ignored.
281  inline virtual int32_t epoch(int32_t db_id, int32_t tb_id) const { return epoch(); }
282 
     // Returns epoch_.floor() narrowed to int32_t.
283  inline int32_t epochFloor() const { return static_cast<int32_t>(epoch_.floor()); }
284 
     // Advances the epoch by one, marks it as not yet checkpointed and returns the
     // new value. Logs FATAL when the new value exceeds Epoch::max_allowable_epoch().
285  inline int32_t incrementEpoch() {
286  int32_t newEpoch = epoch_.increment();
287  epochIsCheckpointed_ = false;
288  // We test for error here instead of in Epoch::increment so we can log FileMgr
289  // metadata
290  if (newEpoch > Epoch::max_allowable_epoch()) {
291  LOG(FATAL) << "Epoch for table (" << fileMgrKey_.first << ", " << fileMgrKey_.second
292  << ") greater than maximum allowed value of "
293  << Epoch::max_allowable_epoch() << ".";
294  }
295  return newEpoch;
296  }
297 
     // Returns value of epoch at last checkpoint: the current epoch when
     // epochIsCheckpointed_ is set, otherwise the current epoch minus one.
301  inline int32_t lastCheckpointedEpoch() const {
302  return epoch() - (epochIsCheckpointed_ ? 0 : 1);
303  }
304 
     // Sets the epoch floor to the current epoch ceiling.
305  inline void resetEpochFloor() { epoch_.floor(epoch_.ceiling()); }
306 
     // Returns value max_rollback_epochs (the maxRollbackEpochs_ member).
310  inline int32_t maxRollbackEpochs() { return maxRollbackEpochs_; }
311 
     // Returns number of threads defined by parameter num-reader-threads (the
     // num_reader_threads_ member).
316  inline size_t getNumReaderThreads() { return num_reader_threads_; }
317 
325  FILE* getFileForFileId(const int32_t fileId);
326 
327  size_t getNumChunks() override;
328  size_t getNumUsedMetadataPagesForChunkKey(const ChunkKey& chunkKey) const;
329 
331  // #TM Not sure if we need this below
332  bool getDBConvert() const;
333  void createOrMigrateTopLevelMetadata();
     // Returns a copy of fileMgrBasePath_.
334  inline std::string getFileMgrBasePath() const { return fileMgrBasePath_; }
335  virtual void closeRemovePhysical();
336 
337  void removeTableRelatedDS(const int32_t db_id, const int32_t table_id) override;
338 
339  virtual void free_page(std::pair<FileInfo*, int32_t>&& page);
     // Always true in this class; virtual so a subclass can answer differently.
340  inline virtual bool hasFileMgrKey() const { return true; }
     // Returns a copy of fileMgrKey_.
341  const TablePair get_fileMgrKey() const { return fileMgrKey_; }
342 
343  boost::filesystem::path getFilePath(const std::string& file_name) const;
344 
345  // Visible for use in unit tests.
346  void writePageMappingsToStatusFile(const std::vector<PageMapping>& page_mappings);
347 
348  // Visible for use in unit tests.
349  void renameCompactionStatusFile(const char* const from_status,
350  const char* const to_status);
351 
352  void compactFiles();
353 
357  virtual bool updatePageIfDeleted(FileInfo* file_info,
358  ChunkKey& chunk_key,
359  int32_t contingent,
360  int32_t page_epoch,
361  int32_t page_num);
362 
     // True if a read error should cause a fatal error. Always true in this class;
     // virtual so a subclass can answer differently.
366  inline virtual bool failOnReadError() const { return true; }
367 
     // Accessors for the const page_size_ / metadata_page_size_ members.
368  inline size_t getPageSize() const { return page_size_; }
369  inline size_t getMetadataPageSize() const { return metadata_page_size_; }
370 
371  // Used to describe the manager in logging and error messages.
372  virtual std::string describeSelf() const;
373 
374  FILE* createFile(const std::string& full_path, const size_t requested_file_size) const;
375  std::pair<FILE*, std::string> createFile(const std::string& base_path,
376  const int file_id,
377  const size_t page_size,
378  const size_t num_pages) const;
379  size_t writeFile(FILE* f,
380  const size_t offset,
381  const size_t size,
382  const int8_t* buf) const;
383 
384  static constexpr size_t DEFAULT_NUM_PAGES_PER_DATA_FILE{256};
385  static constexpr size_t DEFAULT_NUM_PAGES_PER_METADATA_FILE{4096};
386 
387  // Name of files that indicate the different statuses/phases of data compaction.
388  static constexpr char const* COPY_PAGES_STATUS{"pending_data_compaction_0"};
389  static constexpr char const* UPDATE_PAGE_VISIBILITY_STATUS{"pending_data_compaction_1"};
390  static constexpr char const* DELETE_EMPTY_FILES_STATUS{"pending_data_compaction_2"};
391 
392  // Methods that enable override of number of pages per data/metadata file
393  // for use in unit tests.
394  static void setNumPagesPerDataFile(size_t num_pages);
395  static void setNumPagesPerMetadataFile(size_t num_pages);
396 
397  static void renameAndSymlinkLegacyFiles(const std::string& table_data_dir);
398 
399  static constexpr char LEGACY_EPOCH_FILENAME[] = "epoch";
400  static constexpr char EPOCH_FILENAME[] = "epoch_metadata";
401  static constexpr char DB_META_FILENAME[] = "dbmeta";
402  static constexpr char FILE_MGR_VERSION_FILENAME[] = "filemgr_version";
403  static constexpr int32_t INVALID_VERSION = -1;
404  static constexpr int32_t LATEST_FILE_MGR_VERSION = 2;
405 
406  protected:
407  // Used to initialize CachingFileMgr.
408  FileMgr(const size_t defaultPageSize, const size_t defaultMetadataPageSize);
409 
411  std::string fileMgrBasePath_;
     // Owns the FileInfo objects, keyed by file id (see getFileInfoForFileId()).
412  std::map<int32_t, std::unique_ptr<FileInfo>> files_;
     // NOTE(review): presumably the index of the next file id -- confirm.
416  unsigned nextFileId_;
     // NOTE(review): presumably the pointer to DB level metadata; the generated docs
     // print that text next to getPageMutex_ -- confirm.
418  FILE* DBMetaFile_ = nullptr;
419  std::mutex getPageMutex_;
422 
     // (FileInfo*, int32_t) pairs; same element type that free_page() accepts.
424  std::vector<std::pair<FileInfo*, int32_t>> free_pages_;
425  bool isFullyInitted_{false};
426 
429 
446  FileInfo* createFileInfo(const size_t pageSize, const size_t numPages);
447  FileInfo* openExistingFile(const std::string& path,
448  const int32_t fileId,
449  const size_t pageSize,
450  const size_t numPages,
451  std::vector<HeaderInfo>& headerVec);
452  void createEpochFile(const std::string& epochFileName);
453  int32_t openAndReadLegacyEpochFile(const std::string& epochFileName);
454  void openAndReadEpochFile(const std::string& epochFileName);
455  void writeAndSyncEpochToDisk();
456  void setEpoch(const int32_t newEpoch); // resets current value of epoch at startup
457  int32_t readVersionFromDisk(const std::string& versionFileName) const;
458  void writeAndSyncVersionToDisk(const std::string& versionFileName,
459  const int32_t version);
460  void processFileFutures(std::vector<std::future<std::vector<HeaderInfo>>>& file_futures,
461  std::vector<HeaderInfo>& headerVec);
462  virtual FileBuffer* createBufferUnlocked(const ChunkKey& key,
463  size_t pageSize = 0,
464  const size_t numBytes = 0);
465  virtual FileBuffer* createBufferFromHeaders(
466  const ChunkKey& key,
467  const std::vector<HeaderInfo>::const_iterator& headerStartIt,
468  const std::vector<HeaderInfo>::const_iterator& headerEndIt);
469 
470  // Migration functions
471  void migrateToLatestFileMgrVersion();
472  void migrateEpochFileV0();
473  void migrateLegacyFilesV1();
474 
475  OpenFilesResult openFiles();
476 
477  void clearFileInfos();
478 
479  // Data compaction methods
480  void copySourcePageForCompaction(const Page& source_page,
481  FileInfo* destination_file_info,
482  std::vector<PageMapping>& page_mappings,
483  std::set<Page>& touched_pages);
484  int32_t copyPageWithoutHeaderSize(const Page& source_page,
485  const Page& destination_page);
486  void sortAndCopyFilePagesForCompaction(size_t page_size,
487  std::vector<PageMapping>& page_mappings,
488  std::set<Page>& touched_pages);
489  void updateMappedPagesVisibility(const std::vector<PageMapping>& page_mappings);
490  void deleteEmptyFiles();
491  void resumeFileCompaction(const std::string& status_file_name);
492  std::vector<PageMapping> readPageMappingsFromStatusFile();
493 
494  // For testing purposes only
495  FileMgr(const int epoch);
496 
497  void closePhysicalUnlocked();
498  void syncFilesToDisk();
499  void freePages();
500  void initializeNumThreads(size_t num_reader_threads = 0);
501  virtual FileBuffer* allocateBuffer(const size_t page_size,
502  const ChunkKey& key,
503  const size_t num_bytes = 0);
504  virtual FileBuffer* allocateBuffer(
505  const ChunkKey& key,
506  const std::vector<HeaderInfo>::const_iterator& headerStartIt,
507  const std::vector<HeaderInfo>::const_iterator& headerEndIt);
508  virtual ChunkKeyToChunkMap::iterator deleteBufferUnlocked(
509  const ChunkKeyToChunkMap::iterator chunk_it,
510  const bool purge = true);
511  virtual FileBuffer* getBufferUnlocked(const ChunkKey& key,
512  const size_t numBytes = 0) const;
513 
514  private:
515  void init(const size_t num_reader_threads, const int32_t epochOverride);
516  void init(const std::string& dataPathToConvertFrom, const int32_t epochOverride);
517 
518  void rollOffOldData(const int32_t epochCeiling, const bool shouldCheckpoint);
519  void freePagesBeforeEpoch(const int32_t min_epoch);
520  void freePagesBeforeEpochUnlocked(const int32_t min_epoch,
521  const ChunkKeyToChunkMap::iterator lower_bound,
522  const ChunkKeyToChunkMap::iterator upper_bound);
523  FileBuffer* getOrCreateBuffer(const ChunkKey& key);
529  bool coreInit();
     // Current epoch: epoch_.ceiling() narrowed to int32_t. The public two-argument
     // epoch(db_id, tb_id) overload forwards to this one.
530  inline int32_t epoch() const { return static_cast<int32_t>(epoch_.ceiling()); }
531  void writeDirtyBuffers();
532 
533  void setDataAndMetadataFileStats(StorageStats& storage_stats) const;
534  uint32_t getFragmentCount() const;
535 
536  virtual void readOnlyCheck(const std::string& action,
537  const std::optional<std::string>& file_name = {}) const;
538 
541 
     // Cleared by incrementEpoch(); consulted by lastCheckpointedEpoch().
543  bool epochIsCheckpointed_ = true;
544  FILE* epochFile_ = nullptr;
545 
546  protected:
547  // gfm_ needs to be defined before the page_size_ because it may be used to
548  // initialize it. However, we also want it to remain private, as the CachingFileMgr
549  // should not be allowed to access it (it will be null). page_size_ should
550  // be protected.
551  const size_t page_size_;
552  const size_t metadata_page_size_;
553 };
554 
555 } // namespace File_Namespace
DEVICE auto upper_bound(ARGS &&...args)
Definition: gpu_enabled.h:123
const size_t metadata_page_size_
Definition: FileMgr.h:552
virtual int32_t epoch(int32_t db_id, int32_t tb_id) const
Returns current value of epoch - should be one greater than recorded at last checkpoint. Because FileMgr only contains buffers from one table we can just return the FileMgr's epoch instead of finding a table-specific epoch.
Definition: FileMgr.h:281
std::vector< int > ChunkKey
Definition: types.h:36
TablePair fileMgrKey_
Global FileMgr.
Definition: FileMgr.h:540
size_t getPageSize() const
Definition: FileMgr.h:368
virtual bool hasFileMgrKey() const
Definition: FileMgr.h:340
const size_t page_size_
Definition: FileMgr.h:551
std::vector< HeaderInfo > header_infos
Definition: FileMgr.h:128
A logical page (Page) belongs to a file on disk.
Definition: Page.h:46
#define LOG(tag)
Definition: Logger.h:285
std::string printSlabs() override
Definition: FileMgr.h:223
size_t getMaxSize() override
Definition: FileMgr.h:224
std::string getFileMgrBasePath() const
Definition: FileMgr.h:334
GlobalFileMgr * gfm_
Definition: FileMgr.h:539
std::mutex getPageMutex_
pointer to DB level metadata
Definition: FileMgr.h:419
Represents/provides access to contiguous data stored in the file system.
Definition: FileBuffer.h:57
static int64_t max_allowable_epoch()
Definition: Epoch.h:69
MgrType getMgrType() override
Definition: FileMgr.h:221
std::string fileMgrBasePath_
Definition: FileMgr.h:411
std::multimap< size_t, int32_t > PageSizeFileMMap
Maps logical page sizes to files.
Definition: FileMgr.h:68
int32_t lastCheckpointedEpoch() const
Returns value of epoch at last checkpoint.
Definition: FileMgr.h:301
size_t getInUseSize() override
Definition: FileMgr.h:225
static size_t num_pages_per_data_file_
Definition: FileMgr.h:427
This file includes the class specification for the FILE manager (FileMgr), and related data structure...
int32_t PageHeaderSizeType
Definition: FileMgr.h:134
PageMapping(int32_t source_file_id, size_t source_page_num, PageHeaderSizeType source_page_header_size, int32_t destination_file_id, size_t destination_page_num)
Definition: FileMgr.h:139
int32_t fileMgrVersion_
the index of the next file id
Definition: FileMgr.h:417
int32_t incrementEpoch()
Definition: FileMgr.h:285
void init(LogOptions const &log_opts)
Definition: Logger.cpp:364
string version
Definition: setup.in.py:73
ChunkKeyToChunkMap chunkIndex_
Definition: FileMgr.h:330
PageSizeFileMMap fileIndex_
Definition: FileMgr.h:414
constexpr int32_t kDbVersion
DB version for DataMgr DS and corresponding file buffer read/write code.
Definition: FileMgr.h:57
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
std::vector< std::pair< FileInfo *, int32_t > > free_pages_
Definition: FileMgr.h:424
An AbstractBuffer is a unit of data management for a data manager.
size_t num_reader_threads_
Maps page sizes to FileInfo objects.
Definition: FileMgr.h:415
static size_t num_pages_per_metadata_file_
Definition: FileMgr.h:428
std::map< ChunkKey, FileBuffer * > ChunkKeyToChunkMap
Maps ChunkKeys (unique ids for Chunks) to Chunk objects.
Definition: FileMgr.h:92
std::string compaction_status_file_name
Definition: FileMgr.h:130
heavyai::shared_mutex chunkIndexMutex_
Definition: FileMgr.h:420
const TablePair get_fileMgrKey() const
Definition: FileMgr.h:341
heavyai::shared_mutex mutex_free_page_
Definition: FileMgr.h:423
int32_t maxRollbackEpochs_
Definition: FileMgr.h:410
DEVICE auto lower_bound(ARGS &&...args)
Definition: gpu_enabled.h:78
Definition: Epoch.h:30
void checkpoint(const int32_t db_id, const int32_t tb_id) override
Definition: FileMgr.h:272
virtual bool failOnReadError() const
True if a read error should cause a fatal error.
Definition: FileMgr.h:366
std::string getStringMgrType() override
Definition: FileMgr.h:222
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
size_t getAllocated() override
Definition: FileMgr.h:226
This file contains the declaration and definition of a Page type and a MultiPage type.
int32_t epoch() const
Definition: FileMgr.h:530
FileInfo * getFileInfoForFileId(const int32_t fileId) const
Definition: FileMgr.h:229
heavyai::shared_mutex files_rw_mutex_
Definition: FileMgr.h:421
int32_t maxRollbackEpochs()
Returns value max_rollback_epochs.
Definition: FileMgr.h:310
std::shared_timed_mutex shared_mutex
std::pair< const int32_t, const int32_t > TablePair
Definition: FileMgr.h:98
unsigned nextFileId_
number of threads used when loading data
Definition: FileMgr.h:416
int32_t epochFloor() const
Definition: FileMgr.h:283
PageHeaderSizeType source_page_header_size
Definition: FileMgr.h:152
bool isAllocationCapped() override
Definition: FileMgr.h:227
size_t getMetadataPageSize() const
Definition: FileMgr.h:369
size_t getNumReaderThreads()
Returns number of threads defined by parameter num-reader-threads which should be used during initial...
Definition: FileMgr.h:316