OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ForeignStorageCache.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  TODO(Misiu): A lot of methods here can be made asynchronous. It may be worth an
19  investigation to determine if it's worth adding async versions of them for performance
20  reasons.
21 */
22 
23 #include "ForeignStorageCache.h"
24 #include <boost/filesystem.hpp>
25 #include "Shared/File.h"
26 #include "Shared/measure.h"
27 
28 namespace foreign_storage {
31 
32 namespace {
33 template <typename Func, typename T>
35  T& chunk_collection,
36  const ChunkKey& chunk_prefix) {
37  ChunkKey upper_prefix(chunk_prefix);
38  upper_prefix.push_back(std::numeric_limits<int>::max());
39  auto end_it = chunk_collection.upper_bound(static_cast<const ChunkKey>(upper_prefix));
40  for (auto chunk_it = chunk_collection.lower_bound(chunk_prefix); chunk_it != end_it;
41  ++chunk_it) {
42  func(*chunk_it);
43  }
44 }
45 
47  buffer->initEncoder(meta->sqlType);
48  buffer->setSize(meta->numBytes);
49  buffer->getEncoder()->setNumElems(meta->numElements);
50  buffer->getEncoder()->resetChunkStats(meta->chunkStats);
51  buffer->setUpdated();
52 }
53 } // namespace
54 
56  validatePath(config.path);
57  caching_file_mgr_ = std::make_unique<File_Namespace::CachingFileMgr>(config);
58 }
59 
61  caching_file_mgr_->deleteBufferIfExists(chunk_key);
62 }
63 
65  AbstractBuffer* buf,
66  const size_t num_bytes) {
67  caching_file_mgr_->putBuffer(key, buf, num_bytes);
68  CHECK(!buf->isDirty());
69 }
70 
71 void ForeignStorageCache::checkpoint(const int32_t db_id, const int32_t tb_id) {
72  caching_file_mgr_->checkpoint(db_id, tb_id);
73 }
74 
76  const ChunkKey& chunk_key) {
77  auto buf = caching_file_mgr_->getBufferIfExists(chunk_key);
78 
79  if (buf) {
80  if ((*buf)->hasDataPages()) {
81  // 1. If the buffer has data pages then must be in the cache.
82  return *buf;
83  }
84  if (is_varlen_data_key(chunk_key)) {
85  // 2. If the buffer is a varlen data buffer and the
86  // corresponding chunk contains only nulls, then even
87  // without data pages it will still have been cached
88  // if it has a corresponding index buffer which does
89  // have dataPages
90  // Note the empty buffer proviso that the size be 0,
91  // corresponding to all nulls in the chunk
92  auto index_chunk_key = chunk_key;
93  index_chunk_key[CHUNK_KEY_VARLEN_IDX] = 2;
94  auto index_buffer = caching_file_mgr_->getBufferIfExists(index_chunk_key);
95  if (index_buffer && (*index_buffer)->hasDataPages() && (*buf)->size() == 0) {
96  return *buf;
97  }
98  }
99  }
100  // 3. Otherwise this chunk hasn't been cached.
101  return nullptr;
102 }
103 
104 bool ForeignStorageCache::isMetadataCached(const ChunkKey& chunk_key) const {
105  auto buf = caching_file_mgr_->getBufferIfExists(chunk_key);
106  if (buf) {
107  return (*buf)->hasEncoder();
108  }
109  return false;
110 }
111 
113  auto timer = DEBUG_TIMER(__func__);
114  if (metadata_vec.empty()) {
115  return;
116  }
117  auto first_chunk_key = metadata_vec.begin()->first;
118  for (auto& [chunk_key, metadata] : metadata_vec) {
119  CHECK(in_same_table(chunk_key, first_chunk_key));
120  AbstractBuffer* buf;
121  AbstractBuffer* index_buffer = nullptr;
122  ChunkKey index_chunk_key;
123  if (is_varlen_key(chunk_key)) {
124  // For variable length chunks, metadata is associated with the data chunk.
125  CHECK(is_varlen_data_key(chunk_key));
126  index_chunk_key = {chunk_key[CHUNK_KEY_DB_IDX],
127  chunk_key[CHUNK_KEY_TABLE_IDX],
128  chunk_key[CHUNK_KEY_COLUMN_IDX],
129  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
130  2};
131  }
132  bool chunk_in_cache = false;
133  if (!caching_file_mgr_->isBufferOnDevice(chunk_key)) {
134  buf = caching_file_mgr_->createBuffer(chunk_key);
135 
136  if (!index_chunk_key.empty()) {
137  CHECK(!caching_file_mgr_->isBufferOnDevice(index_chunk_key));
138  index_buffer = caching_file_mgr_->createBuffer(index_chunk_key);
139  CHECK(index_buffer);
140  }
141  } else {
142  buf = caching_file_mgr_->getBuffer(chunk_key);
143 
144  if (!index_chunk_key.empty()) {
145  CHECK(caching_file_mgr_->isBufferOnDevice(index_chunk_key));
146  index_buffer = caching_file_mgr_->getBuffer(index_chunk_key);
147  CHECK(index_buffer);
148  }
149 
150  // We should have already cleared the data unless we are appending
151  // If the buffer metadata has changed, we need to remove this chunk
152  if (buf->getEncoder() != nullptr) {
153  const std::shared_ptr<ChunkMetadata> buf_metadata =
154  std::make_shared<ChunkMetadata>();
155  buf->getEncoder()->getMetadata(buf_metadata);
156  chunk_in_cache = *metadata.get() == *buf_metadata;
157  }
158  }
159 
160  if (!chunk_in_cache) {
161  set_metadata_for_buffer(buf, metadata.get());
162  eraseChunk(chunk_key);
163 
164  if (!index_chunk_key.empty()) {
165  CHECK(index_buffer);
166  index_buffer->setUpdated();
167  eraseChunk(index_chunk_key);
168  }
169  }
170  }
171  caching_file_mgr_->checkpoint(first_chunk_key[CHUNK_KEY_DB_IDX],
172  first_chunk_key[CHUNK_KEY_TABLE_IDX]);
173 }
174 
176  ChunkMetadataVector& metadata_vec,
177  const ChunkKey& chunk_prefix) const {
178  caching_file_mgr_->getChunkMetadataVecForKeyPrefix(metadata_vec, chunk_prefix);
179 }
180 
182  const ChunkKey& chunk_prefix) const {
183  return caching_file_mgr_->hasChunkMetadataForKeyPrefix(chunk_prefix);
184 }
185 
187  CHECK(is_table_key(chunk_prefix));
188  auto timer = DEBUG_TIMER(__func__);
189  caching_file_mgr_->clearForTable(chunk_prefix[CHUNK_KEY_DB_IDX],
190  chunk_prefix[CHUNK_KEY_TABLE_IDX]);
191 }
192 
194  auto timer = DEBUG_TIMER(__func__);
195  // FileMgrs do not clean up after themselves nicely, so we need to close all their disk
196  // resources and then re-create the CachingFileMgr to reset it.
197  caching_file_mgr_->closeRemovePhysical();
198  boost::filesystem::create_directory(caching_file_mgr_->getFileMgrBasePath());
199  caching_file_mgr_ = caching_file_mgr_->reconstruct();
200 }
201 
203  const ChunkKey& chunk_prefix) const {
204  return caching_file_mgr_->getChunkKeysForPrefix(chunk_prefix);
205 }
206 
208  caching_file_mgr_->removeChunkKeepMetadata(chunk_key);
209 }
210 
212  return caching_file_mgr_->dumpKeysWithChunkData();
213 }
214 
216  return caching_file_mgr_->dumpKeysWithMetadata();
217 }
218 
220  return caching_file_mgr_->dumpEvictionQueue();
221 }
222 
223 void ForeignStorageCache::validatePath(const std::string& base_path) const {
224  // check if base_path already exists, and if not create one
225  boost::filesystem::path path(base_path);
226  if (boost::filesystem::exists(path)) {
227  if (!boost::filesystem::is_directory(path)) {
228  throw std::runtime_error{
229  "cache path \"" + base_path +
230  "\" is not a directory. Please specify a valid directory "
231  "with --disk_cache_path=<path>, or use the default location."};
232  }
233  } else { // data directory does not exist
234  if (!boost::filesystem::create_directory(path)) {
235  throw std::runtime_error{
236  "could not create directory at cache path \"" + base_path +
237  "\". Please specify a valid directory location "
238  "with --disk_cache_path=<path> or use the default location."};
239  }
240  }
241 }
242 
244  const std::set<ChunkKey>& keys) const {
245  ChunkToBufferMap chunk_buffer_map;
246  for (const auto& key : keys) {
247  CHECK(caching_file_mgr_->isBufferOnDevice(key));
248  chunk_buffer_map[key] = caching_file_mgr_->getBuffer(key);
249  auto file_buf = dynamic_cast<File_Namespace::FileBuffer*>(chunk_buffer_map[key]);
250  CHECK(file_buf);
251  CHECK(!file_buf->hasDataPages());
252 
253  // Clear all buffer metadata
254  file_buf->resetToEmpty();
255  }
256  return chunk_buffer_map;
257 }
258 
260  const ChunkKey& chunk_key,
261  bool is_new_buffer) {
262  if (!is_new_buffer) {
263  CHECK(caching_file_mgr_->isBufferOnDevice(chunk_key));
264  return caching_file_mgr_->getBuffer(chunk_key);
265  } else {
266  CHECK(!caching_file_mgr_->isBufferOnDevice(chunk_key));
267  return caching_file_mgr_->createBuffer(chunk_key);
268  }
269 }
270 
271 void ForeignStorageCache::storeDataWrapper(const std::string& doc,
272  int32_t db_id,
273  int32_t tb_id) {
274  caching_file_mgr_->writeWrapperFile(doc, db_id, tb_id);
275 }
276 
278  int32_t table_id) const {
279  return caching_file_mgr_->hasWrapperFile(db_id, table_id);
280 }
281 
282 } // namespace foreign_storage
std::vector< int > ChunkKey
Definition: types.h:36
bool isMetadataCached(const ChunkKey &) const
bool is_table_key(const ChunkKey &key)
Definition: types.h:44
heavyai::shared_lock< heavyai::shared_mutex > read_lock
bool is_varlen_data_key(const ChunkKey &key)
Definition: types.h:75
void storeDataWrapper(const std::string &doc, int32_t db_id, int32_t tb_id)
#define CHUNK_KEY_DB_IDX
Definition: types.h:38
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:41
std::map< ChunkKey, AbstractBuffer * > ChunkToBufferMap
heavyai::unique_lock< heavyai::shared_mutex > write_lock
This file includes the class specification for the cache used by the Foreign Storage Interface (FSI)...
Represents/provides access to contiguous data stored in the file system.
Definition: FileBuffer.h:57
void initEncoder(const SQLTypeInfo &tmp_sql_type)
void setNumElems(const size_t num_elems)
Definition: Encoder.h:285
ChunkStats chunkStats
Definition: ChunkMetadata.h:37
virtual bool resetChunkStats(const ChunkStats &)
: Reset chunk level stats (min, max, nulls) using new values from the argument.
Definition: Encoder.h:274
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:231
std::shared_lock< T > shared_lock
void getCachedMetadataVecForKeyPrefix(ChunkMetadataVector &, const ChunkKey &) const
bool hasStoredDataWrapperMetadata(int32_t db_id, int32_t table_id) const
std::unique_lock< T > unique_lock
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:39
void iterate_over_matching_prefix(Func func, T &chunk_collection, const ChunkKey &chunk_prefix)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
An AbstractBuffer is a unit of data management for a data manager.
ChunkToBufferMap getChunkBuffersForCaching(const std::set< ChunkKey > &chunk_keys) const
void putBuffer(const ChunkKey &, AbstractBuffer *, const size_t numBytes=0)
void cacheMetadataVec(const ChunkMetadataVector &)
std::set< ChunkKey >::iterator eraseChunk(const std::set< ChunkKey >::iterator &)
std::unique_ptr< File_Namespace::CachingFileMgr > caching_file_mgr_
void deleteBufferIfExists(const ChunkKey &chunk_key)
void set_metadata_for_buffer(AbstractBuffer *buffer, ChunkMetadata *meta)
void validatePath(const std::string &) const
#define CHUNK_KEY_VARLEN_IDX
Definition: types.h:42
std::vector< ChunkKey > getCachedChunksForKeyPrefix(const ChunkKey &) const
AbstractBuffer * getChunkBufferForPrecaching(const ChunkKey &chunk_key, bool is_new_buffer)
void setSize(const size_t size)
void checkpoint(const int32_t db_id, const int32_t tb_id)
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:411
File_Namespace::FileBuffer * getCachedChunkIfExists(const ChunkKey &)
bool hasCachedMetadataForKeyPrefix(const ChunkKey &) const
ForeignStorageCache(const File_Namespace::DiskCacheConfig &config)
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:40
bool in_same_table(const ChunkKey &left_key, const ChunkKey &right_key)
Definition: types.h:83
SQLTypeInfo sqlType
Definition: ChunkMetadata.h:34
A selection of helper methods for File I/O.
bool is_varlen_key(const ChunkKey &key)
Definition: types.h:71
size_t numElements
Definition: ChunkMetadata.h:36