OmniSciDB  85c2d10cdc
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
CachingForeignStorageMgr.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 #include "Catalog/ForeignTable.h"
19 #include "CsvDataWrapper.h"
21 #include "ForeignTableSchema.h"
22 #include "ParquetDataWrapper.h"
23 
24 namespace foreign_storage {
25 
namespace {
// Upper bound on the time a single table refresh may spend re-populating
// cached chunks before bailing out early (checked once per fragment in
// refreshChunksInCacheByFragment()).
constexpr int64_t MAX_REFRESH_TIME_IN_SECONDS = 60 * 60;  // 1 hour
}  // namespace
29 
31  : ForeignStorageMgr(), disk_cache_(cache) {
33 }
34 
36  ForeignDataWrapper& data_wrapper,
37  ChunkToBufferMap& required_buffers,
38  ChunkToBufferMap& optional_buffers) {
39  try {
40  data_wrapper.populateChunkBuffers(required_buffers, optional_buffers);
41  } catch (const std::runtime_error& error) {
42  // clear any partially loaded but failed chunks (there may be some
43  // fully-loaded chunks as well but they will be cleared conservatively
44  // anyways)
45  for (const auto& [chunk_key, buffer] : required_buffers) {
46  if (auto file_buffer = dynamic_cast<File_Namespace::FileBuffer*>(buffer)) {
47  file_buffer->freeChunkPages();
48  }
49  }
50  for (const auto& [chunk_key, buffer] : optional_buffers) {
51  if (auto file_buffer = dynamic_cast<File_Namespace::FileBuffer*>(buffer)) {
52  file_buffer->freeChunkPages();
53  }
54  }
55 
56  throw ForeignStorageException(error.what());
57  }
58 }
59 
61  AbstractBuffer* destination_buffer,
62  const size_t num_bytes) {
63  CHECK(destination_buffer);
64  CHECK(!destination_buffer->isDirty());
65 
67 
68  // TODO: Populate optional buffers as part of CSV performance improvement
69  std::vector<ChunkKey> chunk_keys = get_keys_vec_from_table(chunk_key);
70  std::vector<ChunkKey> optional_keys;
71  ChunkToBufferMap optional_buffers;
72 
73  // Use hints to prefetch other chunks in fragment into cache
74  auto& data_wrapper = *getDataWrapper(chunk_key);
75  std::set<ChunkKey> optional_set;
76  getOptionalChunkKeySet(optional_set,
77  chunk_key,
78  get_keys_set_from_table(chunk_key),
79  data_wrapper.getCachedParallelismLevel());
80  for (const auto& key : optional_set) {
81  if (disk_cache_->getCachedChunkIfExists(key) == nullptr) {
82  optional_keys.emplace_back(key);
83  }
84  }
85 
86  if (optional_keys.size()) {
87  optional_buffers = disk_cache_->getChunkBuffersForCaching(optional_keys);
88  }
89 
90  ChunkToBufferMap required_buffers = disk_cache_->getChunkBuffersForCaching(chunk_keys);
91  CHECK(required_buffers.find(chunk_key) != required_buffers.end());
92  populateChunkBuffersSafely(data_wrapper, required_buffers, optional_buffers);
93  disk_cache_->cacheTableChunks(chunk_keys);
94  if (optional_keys.size()) {
95  disk_cache_->cacheTableChunks(optional_keys);
96  }
97 
98  AbstractBuffer* buffer = required_buffers.at(chunk_key);
99  CHECK(buffer);
100 
101  buffer->copyTo(destination_buffer, num_bytes);
102 }
103 
105  ChunkMetadataVector& chunk_metadata,
106  const ChunkKey& keyPrefix) {
107  auto [db_id, tb_id] = get_table_prefix(keyPrefix);
108  ForeignStorageMgr::getChunkMetadataVecForKeyPrefix(chunk_metadata, keyPrefix);
109  getDataWrapper(keyPrefix)->serializeDataWrapperInternals(
111 }
112 
114  const ChunkKey& table_key,
115  const ChunkMetadataVector& chunk_metadata) {
116  auto [db_id, tb_id] = get_table_prefix(table_key);
117  getDataWrapper(table_key)->restoreDataWrapperInternals(
119  chunk_metadata);
120 }
121 
123  const bool evict_cached_entries) {
124  CHECK(is_table_key(table_key));
127  evict_cached_entries ? disk_cache_->clearForTablePrefix(table_key)
128  : refreshTableInCache(table_key);
129 }
130 
132  CHECK(is_table_key(table_key));
133 
134  // Before we can refresh a table we should make sure it has recovered any data
135  // if the table has been unused since the last server restart.
136  if (!disk_cache_->hasCachedMetadataForKeyPrefix(table_key)) {
137  ChunkMetadataVector old_cached_metadata;
138  disk_cache_->recoverCacheForTable(old_cached_metadata, table_key);
139  }
140 
141  // Preserve the list of which chunks were cached per table to refresh after clear.
142  std::vector<ChunkKey> old_chunk_keys =
144  auto catalog =
146  CHECK(catalog);
147  bool append_mode =
148  catalog->getForeignTableUnlocked(table_key[CHUNK_KEY_TABLE_IDX])->isAppendMode();
149 
150  append_mode ? refreshAppendTableInCache(table_key, old_chunk_keys)
151  : refreshNonAppendTableInCache(table_key, old_chunk_keys);
152 }
153 
155  // Determine last fragment ID
156  int last_frag_id = 0;
157  if (disk_cache_->hasCachedMetadataForKeyPrefix(table_key)) {
158  ChunkMetadataVector cached_metadata;
159  disk_cache_->getCachedMetadataVecForKeyPrefix(cached_metadata, table_key);
160  for (const auto& [key, metadata] : cached_metadata) {
161  last_frag_id = std::max(last_frag_id, key[CHUNK_KEY_FRAGMENT_IDX]);
162  }
163  }
164  return last_frag_id;
165 }
166 
168  const ChunkKey& table_key,
169  const std::vector<ChunkKey>& old_chunk_keys) {
170  CHECK(is_table_key(table_key));
172  int last_frag_id = getHighestCachedFragId(table_key);
173 
174  ChunkMetadataVector storage_metadata;
175  getChunkMetadataVecForKeyPrefix(storage_metadata, table_key);
176  try {
177  disk_cache_->cacheMetadataWithFragIdGreaterOrEqualTo(storage_metadata, last_frag_id);
178  refreshChunksInCacheByFragment(old_chunk_keys, last_frag_id);
179  } catch (std::runtime_error& e) {
181  }
182 }
183 
185  const ChunkKey& table_key,
186  const std::vector<ChunkKey>& old_chunk_keys) {
187  CHECK(is_table_key(table_key));
188  ChunkMetadataVector storage_metadata;
189  disk_cache_->clearForTablePrefix(table_key);
190  getChunkMetadataVecForKeyPrefix(storage_metadata, table_key);
191 
192  try {
193  disk_cache_->cacheMetadataVec(storage_metadata);
194  refreshChunksInCacheByFragment(old_chunk_keys, 0);
195  } catch (std::runtime_error& e) {
197  }
198 }
199 
201  const std::vector<ChunkKey>& old_chunk_keys,
202  int start_frag_id) {
203  int64_t total_time{0};
204  auto fragment_refresh_start_time = std::chrono::high_resolution_clock::now();
205 
206  if (old_chunk_keys.empty()) {
207  return;
208  }
209  // Iterate through previously cached chunks and re-cache them. Caching is
210  // done one fragment at a time, for all applicable chunks in the fragment.
211  ChunkToBufferMap optional_buffers;
212  std::vector<ChunkKey> chunk_keys_to_be_cached;
213  auto fragment_id = old_chunk_keys[0][CHUNK_KEY_FRAGMENT_IDX];
214  const ChunkKey table_key{get_table_key(old_chunk_keys[0])};
215  std::vector<ChunkKey> chunk_keys_in_fragment;
216  for (const auto& chunk_key : old_chunk_keys) {
217  if (chunk_key[CHUNK_KEY_FRAGMENT_IDX] < start_frag_id) {
218  continue;
219  }
220  if (disk_cache_->isMetadataCached(chunk_key)) {
221  if (chunk_key[CHUNK_KEY_FRAGMENT_IDX] != fragment_id) {
222  if (chunk_keys_in_fragment.size() > 0) {
223  auto required_buffers =
224  disk_cache_->getChunkBuffersForCaching(chunk_keys_in_fragment);
226  *getDataWrapper(table_key), required_buffers, optional_buffers);
227  chunk_keys_in_fragment.clear();
228  }
229  // At this point, cache buffers for refreshable chunks in the last fragment
230  // have been populated. Exit if the max refresh time has been exceeded.
231  // Otherwise, move to the next fragment.
232  auto current_time = std::chrono::high_resolution_clock::now();
233  total_time += std::chrono::duration_cast<std::chrono::seconds>(
234  current_time - fragment_refresh_start_time)
235  .count();
236  if (total_time >= MAX_REFRESH_TIME_IN_SECONDS) {
237  LOG(WARNING) << "Refresh time exceeded for table key: { " << table_key[0]
238  << ", " << table_key[1] << " } after fragment id: " << fragment_id;
239  break;
240  } else {
241  fragment_refresh_start_time = std::chrono::high_resolution_clock::now();
242  }
243  fragment_id = chunk_key[CHUNK_KEY_FRAGMENT_IDX];
244  }
245  // Key may have been cached during scan
246  if (disk_cache_->getCachedChunkIfExists(chunk_key) == nullptr) {
247  if (is_varlen_key(chunk_key)) {
248  CHECK(is_varlen_data_key(chunk_key));
249  ChunkKey index_chunk_key{chunk_key[CHUNK_KEY_DB_IDX],
250  chunk_key[CHUNK_KEY_TABLE_IDX],
251  chunk_key[CHUNK_KEY_COLUMN_IDX],
252  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
253  2};
254  chunk_keys_in_fragment.emplace_back(index_chunk_key);
255  chunk_keys_to_be_cached.emplace_back(index_chunk_key);
256  }
257  chunk_keys_in_fragment.emplace_back(chunk_key);
258  chunk_keys_to_be_cached.emplace_back(chunk_key);
259  }
260  }
261  }
262  if (chunk_keys_in_fragment.size() > 0) {
263  auto required_buffers =
264  disk_cache_->getChunkBuffersForCaching(chunk_keys_in_fragment);
266  *getDataWrapper(table_key), required_buffers, optional_buffers);
267  }
268  if (chunk_keys_to_be_cached.size() > 0) {
269  disk_cache_->cacheTableChunks(chunk_keys_to_be_cached);
270  }
271 }
272 
274  const ChunkKey& chunk_key) {
275  ChunkKey table_key = get_table_key(chunk_key);
276  if (createDataWrapperIfNotExists(table_key)) {
277  ChunkMetadataVector chunk_metadata;
278  if (disk_cache_->hasCachedMetadataForKeyPrefix(table_key)) {
279  disk_cache_->getCachedMetadataVecForKeyPrefix(chunk_metadata, table_key);
280  recoverDataWrapperFromDisk(table_key, chunk_metadata);
281  } else {
282  getDataWrapper(table_key)->populateChunkMetadata(chunk_metadata);
283  }
284  }
285 }
286 
287 } // namespace foreign_storage
std::vector< ChunkKey > get_keys_vec_from_table(const ChunkKey &destination_chunk_key)
void refreshAppendTableInCache(const ChunkKey &table_key, const std::vector< ChunkKey > &old_chunk_keys)
std::vector< int > ChunkKey
Definition: types.h:37
bool isMetadataCached(const ChunkKey &) const
std::shared_ptr< ForeignDataWrapper > getDataWrapper(const ChunkKey &chunk_key)
std::set< ChunkKey > get_keys_set_from_table(const ChunkKey &destination_chunk_key)
bool is_table_key(const ChunkKey &key)
Definition: types.h:44
bool is_varlen_data_key(const ChunkKey &key)
Definition: types.h:70
#define LOG(tag)
Definition: Logger.h:188
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
static void checkIfS3NeedsToBeEnabled(const ChunkKey &chunk_key)
std::map< ChunkKey, AbstractBuffer * > ChunkToBufferMap
void cacheTableChunks(const std::vector< ChunkKey > &chunk_keys)
void refreshTableInCache(const ChunkKey &table_key)
ChunkKey get_table_key(const ChunkKey &key)
Definition: types.h:52
AbstractBuffer * getCachedChunkIfExists(const ChunkKey &)
void refreshNonAppendTableInCache(const ChunkKey &table_key, const std::vector< ChunkKey > &old_chunk_keys)
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunk_metadata, const ChunkKey &chunk_key_prefix) override
void getCachedMetadataVecForKeyPrefix(ChunkMetadataVector &, const ChunkKey &) const
static SysCatalog & instance()
Definition: SysCatalog.h:292
void createOrRecoverDataWrapperIfNotExists(const ChunkKey &chunk_key)
void getOptionalChunkKeySet(std::set< ChunkKey > &optional_chunk_keys, const ChunkKey &chunk_key, const std::set< ChunkKey > &required_chunk_keys, const ForeignDataWrapper::ParallelismLevel parallelism_level)
int count
void refreshChunksInCacheByFragment(const std::vector< ChunkKey > &old_chunk_keys, int last_frag_id)
void recoverDataWrapperFromDisk(const ChunkKey &table_key, const ChunkMetadataVector &chunk_metadata)
const std::string wrapper_file_name
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
An AbstractBuffer is a unit of data management for a data manager.
void cacheMetadataWithFragIdGreaterOrEqualTo(const ChunkMetadataVector &metadata_vec, const int frag_id)
bool recoverCacheForTable(ChunkMetadataVector &, const ChunkKey &)
void populateChunkBuffersSafely(ForeignDataWrapper &data_wrapper, ChunkToBufferMap &required_buffers, ChunkToBufferMap &optional_buffers)
void refreshTable(const ChunkKey &table_key, const bool evict_cached_entries) override
void cacheMetadataVec(const ChunkMetadataVector &)
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
void getChunkMetadataVecForKeyPrefix(ChunkMetadataVector &chunk_metadata, const ChunkKey &chunk_key_prefix) override
std::string getCacheDirectoryForTable(int db_id, int tb_id) const
virtual void populateChunkBuffers(const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers)=0
std::vector< ChunkKey > getCachedChunksForKeyPrefix(const ChunkKey &) const
void copyTo(AbstractBuffer *destination_buffer, const size_t num_bytes=0)
std::pair< int, int > get_table_prefix(const ChunkKey &key)
Definition: types.h:57
#define CHECK(condition)
Definition: Logger.h:197
bool hasCachedMetadataForKeyPrefix(const ChunkKey &) const
void clearTempChunkBufferMapEntriesForTable(const ChunkKey &table_key)
ChunkToBufferMap getChunkBuffersForCaching(const std::vector< ChunkKey > &chunk_keys) const
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41
void fetchBuffer(const ChunkKey &chunk_key, AbstractBuffer *destination_buffer, const size_t num_bytes) override
bool createDataWrapperIfNotExists(const ChunkKey &chunk_key)
bool is_varlen_key(const ChunkKey &key)
Definition: types.h:66
int getHighestCachedFragId(const ChunkKey &table_key)