OmniSciDB  2e3a973ef4
CachingForeignStorageMgr.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
19 #include "Catalog/ForeignTable.h"
20 #include "CsvDataWrapper.h"
21 #include "ForeignTableSchema.h"
22 #include "ParquetDataWrapper.h"
23 
24 namespace foreign_storage {
25 
// File-local constants.
26 namespace {
// Refresh time budget: 60 * 60 seconds (one hour). The per-fragment refresh loop in
// this file compares its accumulated elapsed seconds against this value and stops
// with a warning once the budget is reached.
27 constexpr int64_t MAX_REFRESH_TIME_IN_SECONDS = 60 * 60;
// NOTE(review): not referenced on any line visible in this listing; presumably it is
// appended to a table's cache directory to name the serialized data-wrapper state
// file -- confirm against the lines that are missing here.
28 const std::string wrapper_file_name = "/wrapper_metadata.json";
29 } // namespace
30 
// Constructor fragment: default-constructs the ForeignStorageMgr base and stores the
// `cache` argument in disk_cache_.
// NOTE(review): the constructor's signature line and its single body line are
// missing from this listing, so nothing is documented about them here.
32  : ForeignStorageMgr(), disk_cache_(cache) {
34 }
35 
// fetchBuffer (name taken from the cross-reference list at the end of this listing).
// Populates cache buffers for the keys derived from `chunk_key` through the table's
// data wrapper, hands those keys to the disk cache, and then copies the buffer for
// `chunk_key` into `destination_buffer` via copyTo(destination_buffer, num_bytes).
// NOTE(review): this is a rendered listing with source lines missing (the first
// line of the signature and embedded lines 42 and 48), so only the visible
// statements are described.
37  AbstractBuffer* destination_buffer,
38  const size_t num_bytes) {
  // Preconditions: the destination must be non-null and must not be dirty.
39  CHECK(destination_buffer);
40  CHECK(!destination_buffer->isDirty());
41 
43 
44  // TODO: Populate optional buffers as part of CSV performance improvement
  // Keys computed from chunk_key; the same vector is later passed to
  // cacheTableChunks().
45  std::vector<ChunkKey> chunk_keys = get_keys_vec_from_table(chunk_key);
  // Intentionally left empty (see TODO above) and passed to the wrapper as-is.
46  std::map<ChunkKey, AbstractBuffer*> optional_buffers;
  // NOTE(review): the initializer expression for required_buffers is on a line that
  // is not present in this listing.
47  std::map<ChunkKey, AbstractBuffer*> required_buffers =
49  CHECK(required_buffers.find(chunk_key) != required_buffers.end());
  // The data wrapper fills the buffers; afterwards the disk cache is told about the
  // keys that were populated.
50  getDataWrapper(chunk_key)->populateChunkBuffers(required_buffers, optional_buffers);
51  disk_cache_->cacheTableChunks(chunk_keys);
52 
  // The lookup cannot miss (checked above); the mapped pointer itself must also be
  // non-null before it is copied from.
53  AbstractBuffer* buffer = required_buffers.at(chunk_key);
54  CHECK(buffer);
55 
56  buffer->copyTo(destination_buffer, num_bytes);
57 }
58 
// getChunkMetadataVecForKeyPrefix override (name taken from the cross-reference list
// at the end of this listing). Runs the base-class implementation and then asks the
// data wrapper for `keyPrefix` to serialize its internal state.
// NOTE(review): the first signature line and the argument line of the
// serializeDataWrapperInternals call are missing from this listing; the destination
// of the serialized state cannot be determined from the visible lines.
60  ChunkMetadataVector& chunk_metadata,
61  const ChunkKey& keyPrefix) {
  // Base class fills chunk_metadata for the given key prefix.
62  ForeignStorageMgr::getChunkMetadataVecForKeyPrefix(chunk_metadata, keyPrefix);
63  getDataWrapper(keyPrefix)->serializeDataWrapperInternals(
65 }
66 
// recoverDataWrapperFromDisk (name taken from the cross-reference list at the end of
// this listing). Asks the data wrapper for `table_key` to restore its internal state,
// passing `chunk_metadata` as the last argument.
// NOTE(review): the first signature line and the first argument line of the
// restoreDataWrapperInternals call are missing from this listing.
68  const ChunkKey& table_key,
69  const ChunkMetadataVector& chunk_metadata) {
70  getDataWrapper(table_key)->restoreDataWrapperInternals(
72  chunk_metadata);
73 }
74 
// refreshTable override (name taken from the cross-reference list at the end of this
// listing). Either evicts the table's cached entries or refreshes them in place,
// depending on `evict_cached_entries`.
// NOTE(review): the first signature line and embedded line 78 are missing from this
// listing, so any statement on line 78 is not described here.
76  const bool evict_cached_entries) {
  // The key must identify a table (not a column/fragment/chunk).
77  CHECK(is_table_key(table_key));
  // true  -> drop everything cached under this table's key prefix;
  // false -> re-cache the table via refreshTableInCache().
79  evict_cached_entries ? disk_cache_->clearForTablePrefix(table_key)
80  : refreshTableInCache(table_key);
81 }
82 
// refreshTableInCache (name taken from the cross-reference list at the end of this
// listing). Makes sure cached state for the table is loaded, remembers which chunks
// were cached, and then dispatches to the append or non-append refresh routine.
// NOTE(review): the signature line and embedded line 95 (the initializer of
// old_chunk_keys) are missing from this listing.
84  CHECK(is_table_key(table_key));
85 
86  // Before we can refresh a table we should make sure it has recovered any data
87  // if the table has been unused since the last server restart.
88  if (!disk_cache_->hasCachedMetadataForKeyPrefix(table_key)) {
  // The recovered metadata vector is only a required out-parameter here; it is not
  // used after the call.
89  ChunkMetadataVector old_cached_metadata;
90  disk_cache_->recoverCacheForTable(old_cached_metadata, table_key);
91  }
92 
93  // Preserve the list of which chunks were cached per table to refresh after clear.
94  std::vector<ChunkKey> old_chunk_keys =
96 
  // Look up the foreign table through the catalog for the key's database id and read
  // whether it is configured for append mode.
97  bool append_mode = Catalog_Namespace::Catalog::checkedGet(table_key[CHUNK_KEY_DB_IDX])
98  ->getForeignTableUnlocked(table_key[CHUNK_KEY_TABLE_IDX])
99  ->isAppendMode();
100 
101  append_mode ? refreshAppendTableInCache(table_key, old_chunk_keys)
102  : refreshNonAppendTableInCache(table_key, old_chunk_keys);
103 }
104 
// getHighestCachedFragId (name taken from the cross-reference list at the end of this
// listing; the signature line itself is missing here). Returns the largest fragment
// index found among the metadata keys cached for `table_key`, or 0 when no metadata
// is cached for that prefix.
106  // Determine last fragment ID
  // Starts at 0, so the result is never negative.
107  int last_frag_id = 0;
108  if (disk_cache_->hasCachedMetadataForKeyPrefix(table_key)) {
109  ChunkMetadataVector cached_metadata;
110  disk_cache_->getCachedMetadataVecForKeyPrefix(cached_metadata, table_key);
  // Only the key of each (key, metadata) pair is inspected.
111  for (const auto& [key, metadata] : cached_metadata) {
112  last_frag_id = std::max(last_frag_id, key[CHUNK_KEY_FRAGMENT_IDX]);
113  }
114  }
115  return last_frag_id;
116 }
117 
// refreshAppendTableInCache (name taken from the cross-reference list at the end of
// this listing). Refreshes an append-mode table starting from the highest fragment id
// currently in the cache; no cache-clearing call appears in the visible lines.
// NOTE(review): the first signature line, embedded line 122 and the body of the
// catch block (line 131) are missing from this listing, so how a std::runtime_error
// is handled cannot be stated from here.
119  const ChunkKey& table_key,
120  const std::vector<ChunkKey>& old_chunk_keys) {
121  CHECK(is_table_key(table_key));
  // Refresh resumes at the last cached fragment (0 if nothing is cached).
123  int last_frag_id = getHighestCachedFragId(table_key);
124 
  // Fetch current metadata for the table before touching the cache.
125  ChunkMetadataVector storage_metadata;
126  getChunkMetadataVecForKeyPrefix(storage_metadata, table_key);
127  try {
  // Going by the callee's name, only metadata for fragments >= last_frag_id is
  // cached; previously cached chunks in those fragments are then re-populated.
128  disk_cache_->cacheMetadataWithFragIdGreaterOrEqualTo(storage_metadata, last_frag_id);
129  refreshChunksInCacheByFragment(old_chunk_keys, last_frag_id);
130  } catch (std::runtime_error& e) {
132  }
133 }
134 
// refreshNonAppendTableInCache (name taken from the cross-reference list at the end
// of this listing). Fetches fresh metadata first, then clears the table's cache and
// re-caches metadata and previously cached chunks from fragment 0.
// NOTE(review): the first signature line and the body of the catch block (line 150)
// are missing from this listing, so how a std::runtime_error is handled after the
// cache has been cleared cannot be stated from here.
137  const ChunkKey& table_key,
138  const std::vector<ChunkKey>& old_chunk_keys) {
139  CHECK(is_table_key(table_key));
140  // Getting metadata from (foreign) storage could throw if we have lost our connection.
141  // Therefore we only want to clear the cache and refresh after we have confirmed that
142  // we can get new data from storage; if we can't reach storage then throwing here will
143  // leave the cache unchanged.
144  ChunkMetadataVector storage_metadata;
145  getChunkMetadataVecForKeyPrefix(storage_metadata, table_key);
  // Point of no return: the table's cached entries are dropped before re-caching.
146  disk_cache_->clearForTablePrefix(table_key);
147  try {
148  disk_cache_->cacheMetadataVec(storage_metadata);
  // Start at fragment 0 so every previously cached chunk is eligible for refresh.
149  refreshChunksInCacheByFragment(old_chunk_keys, 0);
150  } catch (std::runtime_error& e) {
152  }
153 }
153 
// refreshChunksInCacheByFragment (name taken from the cross-reference list at the end
// of this listing; the first signature line is missing here).
// Re-populates cache buffers for previously cached chunks, one fragment at a time:
//  - keys whose fragment index is below `start_frag_id` are skipped;
//  - keys whose metadata is not currently cached are skipped;
//  - when the fragment index changes, the keys gathered for the previous fragment
//    are populated through the data wrapper and the time budget is checked;
//  - after the loop, any remaining gathered keys are populated and every queued key
//    is passed to disk_cache_->cacheTableChunks().
// NOTE(review): fragment boundaries are detected by a change of fragment index
// between consecutive keys, and the table key is taken from the first element only,
// so this appears to assume old_chunk_keys is grouped by fragment and belongs to a
// single table -- confirm against the caller.
155  const std::vector<ChunkKey>& old_chunk_keys,
156  int start_frag_id) {
  // Accumulated refresh time in whole seconds, plus the start of the current
  // measurement interval.
157  int64_t total_time{0};
158  auto fragment_refresh_start_time = std::chrono::high_resolution_clock::now();
159 
  // Nothing was cached before, so there is nothing to refresh.
160  if (old_chunk_keys.empty()) {
161  return;
162  }
163  // Iterate through previously cached chunks and re-cache them. Caching is
164  // done one fragment at a time, for all applicable chunks in the fragment.
  // optional_buffers stays empty for the whole function.
165  std::map<ChunkKey, AbstractBuffer*> optional_buffers;
  // Every key queued for refresh, across all fragments.
166  std::vector<ChunkKey> chunk_keys_to_be_cached;
  // Fragment currently being gathered; seeded from the first key even if that key
  // is later skipped by the start_frag_id filter.
167  auto fragment_id = old_chunk_keys[0][CHUNK_KEY_FRAGMENT_IDX];
168  const ChunkKey table_key{get_table_key(old_chunk_keys[0])};
  // Keys gathered for the fragment currently being processed.
169  std::vector<ChunkKey> chunk_keys_in_fragment;
170  for (const auto& chunk_key : old_chunk_keys) {
171  if (chunk_key[CHUNK_KEY_FRAGMENT_IDX] < start_frag_id) {
172  continue;
173  }
174  if (disk_cache_->isMetadataCached(chunk_key)) {
  // A new fragment index means the previous fragment's keys are complete: flush
  // them to the data wrapper before starting the next group.
175  if (chunk_key[CHUNK_KEY_FRAGMENT_IDX] != fragment_id) {
176  if (chunk_keys_in_fragment.size() > 0) {
177  auto required_buffers =
178  disk_cache_->getChunkBuffersForCaching(chunk_keys_in_fragment);
179  getDataWrapper(table_key)->populateChunkBuffers(required_buffers,
180  optional_buffers);
181  chunk_keys_in_fragment.clear();
182  }
183  // At this point, cache buffers for refreshable chunks in the last fragment
184  // have been populated. Exit if the max refresh time has been exceeded.
185  // Otherwise, move to the next fragment.
  // NOTE(review): each interval is truncated to whole seconds by duration_cast
  // before being added, so intervals shorter than one second contribute 0 to
  // total_time -- confirm this is the intended accounting.
186  auto current_time = std::chrono::high_resolution_clock::now();
187  total_time += std::chrono::duration_cast<std::chrono::seconds>(
188  current_time - fragment_refresh_start_time)
189  .count();
190  if (total_time >= MAX_REFRESH_TIME_IN_SECONDS) {
191  LOG(WARNING) << "Refresh time exceeded for table key: { " << table_key[0]
192  << ", " << table_key[1] << " } after fragment id: " << fragment_id;
  // The current key has not been queued yet, so it and all later keys are left
  // un-refreshed; keys queued so far are still passed to cacheTableChunks().
193  break;
194  } else {
  // Restart the measurement interval for the next fragment.
195  fragment_refresh_start_time = std::chrono::high_resolution_clock::now();
196  }
197  fragment_id = chunk_key[CHUNK_KEY_FRAGMENT_IDX];
198  }
  // For a variable-length key (which must be a data key), also queue a sibling key
  // that is identical except for a final element of 2 -- the index chunk, going by
  // the variable name. The sibling is queued before the data key.
199  if (is_varlen_key(chunk_key)) {
200  CHECK(is_varlen_data_key(chunk_key));
201  ChunkKey index_chunk_key{chunk_key[CHUNK_KEY_DB_IDX],
202  chunk_key[CHUNK_KEY_TABLE_IDX],
203  chunk_key[CHUNK_KEY_COLUMN_IDX],
204  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
205  2};
206  chunk_keys_in_fragment.emplace_back(index_chunk_key);
207  chunk_keys_to_be_cached.emplace_back(index_chunk_key);
208  }
209  chunk_keys_in_fragment.emplace_back(chunk_key);
210  chunk_keys_to_be_cached.emplace_back(chunk_key);
211  }
212  }
  // Flush the keys gathered for the final fragment (empty after an early break that
  // followed a flush).
213  if (chunk_keys_in_fragment.size() > 0) {
214  auto required_buffers =
215  disk_cache_->getChunkBuffersForCaching(chunk_keys_in_fragment);
216  getDataWrapper(table_key)->populateChunkBuffers(required_buffers, optional_buffers);
217  }
  // Hand every queued key to the disk cache in one call.
218  disk_cache_->cacheTableChunks(chunk_keys_to_be_cached);
219 }
220 
// createOrRecoverDataWrapperIfNotExists (name taken from the cross-reference list at
// the end of this listing; the first signature line is missing here).
// When createDataWrapperIfNotExists() returns true for the chunk's table, the wrapper
// is initialised either from cached metadata or by populating chunk metadata.
// NOTE(review): a true return presumably means a wrapper was newly created -- confirm
// against createDataWrapperIfNotExists().
222  const ChunkKey& chunk_key) {
  // Data wrappers are looked up per table, so reduce the chunk key to its table key.
223  ChunkKey table_key = get_table_key(chunk_key);
224  if (createDataWrapperIfNotExists(table_key)) {
225  ChunkMetadataVector chunk_metadata;
226  if (disk_cache_->hasCachedMetadataForKeyPrefix(table_key)) {
  // Cached metadata exists: load it and restore the wrapper's state from disk.
227  disk_cache_->getCachedMetadataVecForKeyPrefix(chunk_metadata, table_key);
228  recoverDataWrapperFromDisk(table_key, chunk_metadata);
229  } else {
  // Nothing cached: have the wrapper populate chunk metadata. The resulting
  // vector is a local and is not used after this call.
230  getDataWrapper(table_key)->populateChunkMetadata(chunk_metadata);
231  }
232  }
233 }
234 
235 } // namespace foreign_storage
void refreshAppendTableInCache(const ChunkKey &table_key, const std::vector< ChunkKey > &old_chunk_keys)
std::vector< ChunkKey > getCachedChunksForKeyPrefix(const ChunkKey &) const
#define LOG(tag)
Definition: Logger.h:188
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
bool is_table_key(const ChunkKey &key)
Definition: types.h:44
bool hasCachedMetadataForKeyPrefix(const ChunkKey &) const
void cacheTableChunks(const std::vector< ChunkKey > &chunk_keys)
void refreshTableInCache(const ChunkKey &table_key)
void refreshNonAppendTableInCache(const ChunkKey &table_key, const std::vector< ChunkKey > &old_chunk_keys)
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunk_metadata, const ChunkKey &chunk_key_prefix) override
std::string getCacheDirectoryForTablePrefix(const ChunkKey &) const
void createOrRecoverDataWrapperIfNotExists(const ChunkKey &chunk_key)
std::map< ChunkKey, AbstractBuffer * > getChunkBuffersForCaching(const std::vector< ChunkKey > &chunk_keys) const
std::shared_ptr< ForeignDataWrapper > getDataWrapper(const ChunkKey &chunk_key)
void refreshChunksInCacheByFragment(const std::vector< ChunkKey > &old_chunk_keys, int last_frag_id)
void recoverDataWrapperFromDisk(const ChunkKey &table_key, const ChunkMetadataVector &chunk_metadata)
bool is_varlen_key(const ChunkKey &key)
Definition: types.h:61
An AbstractBuffer is a unit of data management for a data manager.
void cacheMetadataWithFragIdGreaterOrEqualTo(const ChunkMetadataVector &metadata_vec, const int frag_id)
static std::shared_ptr< Catalog > checkedGet(const int32_t db_id)
Definition: Catalog.cpp:3770
bool recoverCacheForTable(ChunkMetadataVector &, const ChunkKey &)
ChunkKey get_table_key(const ChunkKey &key)
Definition: types.h:52
void refreshTable(const ChunkKey &table_key, const bool evict_cached_entries) override
void cacheMetadataVec(const ChunkMetadataVector &)
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
std::vector< ChunkKey > get_keys_vec_from_table(const ChunkKey &destination_chunk_key)
bool isMetadataCached(const ChunkKey &) const
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunk_metadata, const ChunkKey &chunk_key_prefix) override
void getCachedMetadataVecForKeyPrefix(ChunkMetadataVector &, const ChunkKey &) const
void copyTo(AbstractBuffer *destination_buffer, const size_t num_bytes=0)
bool is_varlen_data_key(const ChunkKey &key)
Definition: types.h:65
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41
#define CHECK(condition)
Definition: Logger.h:197
std::vector< int > ChunkKey
Definition: types.h:37
void clearTempChunkBufferMapEntriesForTable(const ChunkKey &table_key)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata > >> ChunkMetadataVector
void fetchBuffer(const ChunkKey &chunk_key, AbstractBuffer *destination_buffer, const size_t num_bytes) override
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
bool createDataWrapperIfNotExists(const ChunkKey &chunk_key)
int getHighestCachedFragId(const ChunkKey &table_key)