CachingForeignStorageMgr.cpp
/*
 * Copyright 2020 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "CachingForeignStorageMgr.h"

#include "Catalog/ForeignTable.h"
#include "ForeignTableSchema.h"
#ifdef ENABLE_IMPORT_PARQUET
#include "ParquetDataWrapper.h"
#endif

namespace foreign_storage {

namespace {
constexpr int64_t MAX_REFRESH_TIME_IN_SECONDS = 60 * 60;

inline bool is_system_table_chunk_key(const ChunkKey& chunk_key) {
  CHECK(has_table_prefix(chunk_key));
  auto catalog =
      Catalog_Namespace::SysCatalog::instance().getCatalog(chunk_key[CHUNK_KEY_DB_IDX]);
  CHECK(catalog);
  auto table = catalog->getForeignTable(chunk_key[CHUNK_KEY_TABLE_IDX]);
  CHECK(table);
  return table->is_system_table;
}
}  // namespace

CachingForeignStorageMgr::CachingForeignStorageMgr(ForeignStorageCache* cache)
    : ForeignStorageMgr(), disk_cache_(cache) {
  CHECK(disk_cache_);
}

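// Wraps ForeignDataWrapper::populateChunkBuffers() so that a failed population does not
// leave partially written pages in the cache: on error, all required and optional file
// buffers are freed and the error is rethrown as a ForeignStorageException; on success,
// the affected table is checkpointed in the disk cache.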
void CachingForeignStorageMgr::populateChunkBuffersSafely(
    ForeignDataWrapper& data_wrapper,
    ChunkToBufferMap& required_buffers,
    ChunkToBufferMap& optional_buffers) {
  CHECK_GT(required_buffers.size(), 0U) << "Must populate at least one buffer";
  try {
    data_wrapper.populateChunkBuffers(required_buffers, optional_buffers);
  } catch (const std::runtime_error& error) {
    // Clear any partially loaded but failed chunks (there may be some
    // fully-loaded chunks as well, but they will be cleared conservatively
    // anyway).
    for (const auto& [chunk_key, buffer] : required_buffers) {
      if (auto file_buffer = dynamic_cast<File_Namespace::FileBuffer*>(buffer)) {
        file_buffer->freeChunkPages();
      }
    }
    for (const auto& [chunk_key, buffer] : optional_buffers) {
      if (auto file_buffer = dynamic_cast<File_Namespace::FileBuffer*>(buffer)) {
        file_buffer->freeChunkPages();
      }
    }

    throw ForeignStorageException(error.what());
  }
  // All required buffers should be from the same table.
  auto [db, tb] = get_table_prefix(required_buffers.begin()->first);
  disk_cache_->checkpoint(db, tb);
}

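// Fetches a chunk into destination_buffer. System table chunks bypass the cache and go
// through the base ForeignStorageMgr. Otherwise, a cache hit is copied directly from the
// disk cache; on a miss, the wrapper's parallelism hints are used to also prefetch the
// other not-yet-cached chunks of the fragment (optional buffers) while populating the
// required chunk, which is then copied into the destination buffer.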
void CachingForeignStorageMgr::fetchBuffer(const ChunkKey& chunk_key,
                                           AbstractBuffer* destination_buffer,
                                           const size_t num_bytes) {
  if (is_system_table_chunk_key(chunk_key)) {
    ForeignStorageMgr::fetchBuffer(chunk_key, destination_buffer, num_bytes);
    return;
  }
  CHECK(destination_buffer);
  CHECK(!destination_buffer->isDirty());

  AbstractBuffer* buffer = disk_cache_->getCachedChunkIfExists(chunk_key);
  if (buffer) {
    buffer->copyTo(destination_buffer, num_bytes);
    return;
  } else {
    std::vector<ChunkKey> chunk_keys = get_keys_vec_from_table(chunk_key);
    std::vector<ChunkKey> optional_keys;
    ChunkToBufferMap optional_buffers;

    // Use hints to prefetch other chunks in the fragment into the cache.
    auto& data_wrapper = *getDataWrapper(chunk_key);
    std::set<ChunkKey> optional_set;
    getOptionalChunkKeySet(optional_set,
                           chunk_key,
                           get_keys_set_from_table(chunk_key),
                           data_wrapper.getCachedParallelismLevel());
    for (const auto& key : optional_set) {
      if (disk_cache_->getCachedChunkIfExists(key) == nullptr) {
        optional_keys.emplace_back(key);
      }
    }

    if (optional_keys.size()) {
      optional_buffers = disk_cache_->getChunkBuffersForCaching(optional_keys);
    }

    ChunkToBufferMap required_buffers =
        disk_cache_->getChunkBuffersForCaching(chunk_keys);
    CHECK(required_buffers.find(chunk_key) != required_buffers.end());
    populateChunkBuffersSafely(data_wrapper, required_buffers, optional_buffers);

    AbstractBuffer* cached_buffer = required_buffers.at(chunk_key);
    CHECK(cached_buffer);

    cached_buffer->copyTo(destination_buffer, num_bytes);
  }
}

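// Returns chunk metadata for the given key prefix, serving from the disk cache when it
// already holds metadata for the table and falling back to the data wrapper otherwise.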
void CachingForeignStorageMgr::getChunkMetadataVecForKeyPrefix(
    ChunkMetadataVector& chunk_metadata,
    const ChunkKey& key_prefix) {
  if (is_system_table_chunk_key(key_prefix)) {
    ForeignStorageMgr::getChunkMetadataVecForKeyPrefix(chunk_metadata, key_prefix);
    return;
  }
  CHECK(has_table_prefix(key_prefix));
  // If the disk cache has any cached metadata for a prefix, then it is guaranteed to
  // have all metadata for that table, so we can return a complete set. If it has no
  // metadata, then either the table has no data or it is simply not cached, so we need
  // to go to storage to check.
  if (disk_cache_->hasCachedMetadataForKeyPrefix(key_prefix)) {
    disk_cache_->getCachedMetadataVecForKeyPrefix(chunk_metadata, key_prefix);
    createDataWrapperIfNotExists(key_prefix);
    return;
  }
  getChunkMetadataVecFromDataWrapper(chunk_metadata, key_prefix);
  disk_cache_->cacheMetadataVec(chunk_metadata);
}

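// Pulls chunk metadata for the prefix from the underlying data wrapper. On failure the
// table's cached state is cleared and the exception is rethrown; on success the
// wrapper's serialized state is stored alongside the cache and the table is
// checkpointed.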
void CachingForeignStorageMgr::getChunkMetadataVecFromDataWrapper(
    ChunkMetadataVector& chunk_metadata,
    const ChunkKey& chunk_key_prefix) {
  CHECK(has_table_prefix(chunk_key_prefix));
  auto [db_id, tb_id] = get_table_prefix(chunk_key_prefix);
  try {
    ForeignStorageMgr::getChunkMetadataVecForKeyPrefix(chunk_metadata, chunk_key_prefix);
  } catch (...) {
    clearTable({db_id, tb_id});
    throw;
  }
  auto doc = getDataWrapper(chunk_key_prefix)->getSerializedDataWrapper();
  disk_cache_->storeDataWrapper(doc, db_id, tb_id);

  // If the wrapper populated buffers, we want that action to be checkpointed.
  disk_cache_->checkpoint(db_id, tb_id);
}

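// Refreshes a foreign table: either evicts its cached entries entirely or refreshes its
// cached contents in place.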
void CachingForeignStorageMgr::refreshTable(const ChunkKey& table_key,
                                            const bool evict_cached_entries) {
  CHECK(is_table_key(table_key));
  checkIfS3NeedsToBeEnabled(table_key);
  clearTempChunkBufferMapEntriesForTable(table_key);
  if (evict_cached_entries) {
    clearTable(table_key);
  } else {
    refreshTableInCache(table_key);
  }
}

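// Refreshes a table without evicting it first: records which chunks are currently
// cached, then dispatches to the append-mode or non-append-mode refresh path based on
// the table's catalog settings.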
void CachingForeignStorageMgr::refreshTableInCache(const ChunkKey& table_key) {
  CHECK(is_table_key(table_key));

  // Preserve the list of which chunks were cached per table to refresh after clear.
  std::vector<ChunkKey> old_chunk_keys =
      disk_cache_->getCachedChunksForKeyPrefix(table_key);
  auto catalog =
      Catalog_Namespace::SysCatalog::instance().getCatalog(table_key[CHUNK_KEY_DB_IDX]);
  CHECK(catalog);
  bool append_mode =
      catalog->getForeignTable(table_key[CHUNK_KEY_TABLE_IDX])->isAppendMode();

  append_mode ? refreshAppendTableInCache(table_key, old_chunk_keys)
              : refreshNonAppendTableInCache(table_key, old_chunk_keys);
}

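// Evicts all cached chunks and metadata for the table and releases its data wrapper.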
void CachingForeignStorageMgr::clearTable(const ChunkKey& table_key) {
  disk_cache_->clearForTablePrefix(table_key);
  clearDataWrapper(table_key);
}

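// Returns the highest fragment id present in the cached metadata for the table, or 0 if
// nothing is cached.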
int CachingForeignStorageMgr::getHighestCachedFragId(const ChunkKey& table_key) {
  // Determine the last (highest) cached fragment id.
  int last_frag_id = 0;
  if (disk_cache_->hasCachedMetadataForKeyPrefix(table_key)) {
    ChunkMetadataVector cached_metadata;
    disk_cache_->getCachedMetadataVecForKeyPrefix(cached_metadata, table_key);
    for (const auto& [key, metadata] : cached_metadata) {
      last_frag_id = std::max(last_frag_id, key[CHUNK_KEY_FRAGMENT_IDX]);
    }
  }
  return last_frag_id;
}

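// Append-mode refresh: earlier fragments of an append-only table are assumed immutable,
// so only metadata and chunks for fragments at or beyond the highest cached fragment id
// are re-fetched from storage.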
void CachingForeignStorageMgr::refreshAppendTableInCache(
    const ChunkKey& table_key,
    const std::vector<ChunkKey>& old_chunk_keys) {
  CHECK(is_table_key(table_key));
  int last_frag_id = getHighestCachedFragId(table_key);

  ChunkMetadataVector storage_metadata;
  getChunkMetadataVecFromDataWrapper(storage_metadata, table_key);
  try {
    disk_cache_->cacheMetadataWithFragIdGreaterOrEqualTo(storage_metadata, last_frag_id);
    refreshChunksInCacheByFragment(old_chunk_keys, last_frag_id);
  } catch (std::runtime_error& e) {
    throw PostEvictionRefreshException(e);
  }
}

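// Full refresh: evicts the table, re-caches its metadata from storage, and then
// re-populates every previously cached chunk starting from fragment 0.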
void CachingForeignStorageMgr::refreshNonAppendTableInCache(
    const ChunkKey& table_key,
    const std::vector<ChunkKey>& old_chunk_keys) {
  CHECK(is_table_key(table_key));
  ChunkMetadataVector storage_metadata;
  clearTable(table_key);
  getChunkMetadataVecFromDataWrapper(storage_metadata, table_key);

  try {
    disk_cache_->cacheMetadataVec(storage_metadata);
    refreshChunksInCacheByFragment(old_chunk_keys, 0);
  } catch (std::runtime_error& e) {
    throw PostEvictionRefreshException(e);
  }
}

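// Re-caches the previously cached chunks one fragment at a time, skipping fragments
// below start_frag_id and chunks whose metadata is no longer cached. Varlen chunks also
// re-cache their companion index chunk (sub-key 2). Work stops early with a warning once
// MAX_REFRESH_TIME_IN_SECONDS is exceeded.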
void CachingForeignStorageMgr::refreshChunksInCacheByFragment(
    const std::vector<ChunkKey>& old_chunk_keys,
    int start_frag_id) {
  int64_t total_time{0};
  auto fragment_refresh_start_time = std::chrono::high_resolution_clock::now();

  if (old_chunk_keys.empty()) {
    return;
  }
  // Iterate through previously cached chunks and re-cache them. Caching is
  // done one fragment at a time, for all applicable chunks in the fragment.
  ChunkToBufferMap optional_buffers;
  std::vector<ChunkKey> chunk_keys_to_be_cached;
  auto fragment_id = old_chunk_keys[0][CHUNK_KEY_FRAGMENT_IDX];
  const ChunkKey table_key{get_table_key(old_chunk_keys[0])};
  std::vector<ChunkKey> chunk_keys_in_fragment;
  for (const auto& chunk_key : old_chunk_keys) {
    CHECK(chunk_key[CHUNK_KEY_TABLE_IDX] == table_key[CHUNK_KEY_TABLE_IDX]);
    if (chunk_key[CHUNK_KEY_FRAGMENT_IDX] < start_frag_id) {
      continue;
    }
    if (disk_cache_->isMetadataCached(chunk_key)) {
      if (chunk_key[CHUNK_KEY_FRAGMENT_IDX] != fragment_id) {
        if (chunk_keys_in_fragment.size() > 0) {
          auto required_buffers =
              disk_cache_->getChunkBuffersForCaching(chunk_keys_in_fragment);
          populateChunkBuffersSafely(
              *getDataWrapper(table_key), required_buffers, optional_buffers);
          chunk_keys_in_fragment.clear();
        }
        // At this point, cache buffers for refreshable chunks in the last fragment
        // have been populated. Exit if the max refresh time has been exceeded.
        // Otherwise, move to the next fragment.
        auto current_time = std::chrono::high_resolution_clock::now();
        total_time += std::chrono::duration_cast<std::chrono::seconds>(
                          current_time - fragment_refresh_start_time)
                          .count();
        if (total_time >= MAX_REFRESH_TIME_IN_SECONDS) {
          LOG(WARNING) << "Refresh time exceeded for table key: { " << table_key[0]
                       << ", " << table_key[1] << " } after fragment id: " << fragment_id;
          break;
        } else {
          fragment_refresh_start_time = std::chrono::high_resolution_clock::now();
        }
        fragment_id = chunk_key[CHUNK_KEY_FRAGMENT_IDX];
      }
      // The key may have already been cached during the metadata scan.
      if (disk_cache_->getCachedChunkIfExists(chunk_key) == nullptr) {
        if (is_varlen_key(chunk_key)) {
          CHECK(is_varlen_data_key(chunk_key));
          ChunkKey index_chunk_key{chunk_key[CHUNK_KEY_DB_IDX],
                                   chunk_key[CHUNK_KEY_TABLE_IDX],
                                   chunk_key[CHUNK_KEY_COLUMN_IDX],
                                   chunk_key[CHUNK_KEY_FRAGMENT_IDX],
                                   2};
          chunk_keys_in_fragment.emplace_back(index_chunk_key);
          chunk_keys_to_be_cached.emplace_back(index_chunk_key);
        }
        chunk_keys_in_fragment.emplace_back(chunk_key);
        chunk_keys_to_be_cached.emplace_back(chunk_key);
      }
    }
  }
  if (chunk_keys_in_fragment.size() > 0) {
    auto required_buffers =
        disk_cache_->getChunkBuffersForCaching(chunk_keys_in_fragment);
    populateChunkBuffersSafely(
        *getDataWrapper(table_key), required_buffers, optional_buffers);
  }
}

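// Creates a data wrapper for the chunk's table if one does not already exist, restoring
// the wrapper's internal state from its serialized on-disk form when available. Returns
// true only if a new wrapper was created.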
bool CachingForeignStorageMgr::createDataWrapperIfNotExists(const ChunkKey& chunk_key) {
  std::lock_guard data_wrapper_lock(data_wrapper_mutex_);
  ChunkKey table_key = get_table_key(chunk_key);
  auto data_wrapper_it = data_wrapper_map_.find(table_key);
  if (data_wrapper_it != data_wrapper_map_.end()) {
    return false;
  }
  auto [db, tb] = get_table_prefix(chunk_key);
  createDataWrapperUnlocked(db, tb);
  auto wrapper_file = disk_cache_->getSerializedWrapperPath(db, tb);
  if (boost::filesystem::exists(wrapper_file)) {
    ChunkMetadataVector chunk_metadata;
    disk_cache_->getCachedMetadataVecForKeyPrefix(chunk_metadata, table_key);
    data_wrapper_map_.at(table_key)->restoreDataWrapperInternals(
        disk_cache_->getSerializedWrapperPath(db, tb), chunk_metadata);
  }
  return true;
}

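// Drops all cached data for the table before delegating table teardown to the base
// class.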
void CachingForeignStorageMgr::removeTableRelatedDS(const int db_id, const int table_id) {
  disk_cache_->clearForTablePrefix({db_id, table_id});
  ForeignStorageMgr::removeTableRelatedDS(db_id, table_id);
}

}  // namespace foreign_storage