OmniSciDB  c0231cc57d
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
AbstractTextFileDataWrapper.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <map>
20 #include <queue>
21 #include <set>
22 
24 #include "Catalog/CatalogFwd.h"
25 #include "Catalog/ForeignTable.h"
26 #include "DataMgr/Chunk/Chunk.h"
30 
31 namespace foreign_storage {
32 
39  std::queue<ParseBufferRequest> pending_requests;
40  std::queue<ParseBufferRequest>
41  deferred_requests; // holds requests that will be processed in the next iteration
42  // during an iterative file scan
45  std::condition_variable pending_requests_condition;
46  std::queue<ParseBufferRequest> request_pool;
47  std::mutex request_pool_mutex;
48  std::condition_variable request_pool_condition;
50  std::map<ChunkKey, std::unique_ptr<ForeignStorageBuffer>> chunk_encoder_buffers;
51  std::map<ChunkKey, Chunk_NS::Chunk> cached_chunks;
54 };
55 
57  std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map;  // held by reference; keyed by column id
58  int32_t fragment_id;
59  AbstractBuffer* delete_buffer;  // target of the constructor's delete_buffer(delete_buffer) initializer
60 
61  mutable std::map<int, std::unique_ptr<std::mutex>> column_id_to_chunk_mutex;
62  mutable std::map<int, std::unique_ptr<std::condition_variable>>
63  column_id_to_chunk_conditional_var;  // looked up by getChunkConditionalVariable()
64  mutable std::mutex delete_buffer_mutex;
65 
66  IterativeFileScanParameters(std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map,  // binds the map by reference
67  int32_t fragment_id,
68  AbstractBuffer* delete_buffer)
69  : column_id_to_chunk_map(column_id_to_chunk_map)
70  , fragment_id(fragment_id)
71  , delete_buffer(delete_buffer) {
72  for (const auto& [key, _] : column_id_to_chunk_map) {  // one mutex and one condition variable per column id
73  column_id_to_chunk_mutex[key] = std::make_unique<std::mutex>();
74  column_id_to_chunk_conditional_var[key] =
75  std::make_unique<std::condition_variable>();
76  }
77  }
78 
79  std::mutex& getChunkMutex(const int col_id) const {  // returns the mutex registered for col_id
80  const auto entry = column_id_to_chunk_mutex.find(col_id);
81  CHECK(entry != column_id_to_chunk_mutex.end());  // col_id must have been registered by the constructor
82  return *(entry->second);
83  }
84 
85  std::condition_variable& getChunkConditionalVariable(const int col_id) const {  // returns the condition variable registered for col_id
86  auto var_it = column_id_to_chunk_conditional_var.find(col_id);
87  CHECK(var_it != column_id_to_chunk_conditional_var.end());  // same guard as getChunkMutex(): never dereference end()
88  return *var_it->second;
89  }
90 };
91 
93  public:
95 
96  AbstractTextFileDataWrapper(const int db_id, const ForeignTable* foreign_table);
97 
98  AbstractTextFileDataWrapper(const int db_id,
99  const ForeignTable* foreign_table,
100  const UserMapping* user_mapping,
101  const bool disable_cache);
102 
103  void populateChunkMetadata(ChunkMetadataVector& chunk_metadata_vector) override;
104 
105  void populateChunkBuffers(const ChunkToBufferMap& required_buffers,
106  const ChunkToBufferMap& optional_buffers,
107  AbstractBuffer* delete_buffer) override;
108 
109  std::string getSerializedDataWrapper() const override;
110 
111  void restoreDataWrapperInternals(const std::string& file_path,
112  const ChunkMetadataVector& chunk_metadata) override;
113  bool isRestored() const override;
114 
116 
118  return INTRA_FRAGMENT;
119  }
120 
121  void createRenderGroupAnalyzers() override;
122 
123  bool isLazyFragmentFetchingEnabled() const override { return true; }  // unconditionally reports true for this wrapper
124 
125  struct ResidualBuffer {
126  std::unique_ptr<char[]> residual_data;
127  size_t alloc_size;
130  };
131 
132  protected:
133  virtual const TextFileBufferParser& getFileBufferParser() const = 0;
134  virtual std::optional<size_t> getMaxFileCount() const;
135 
136  private:
137  AbstractTextFileDataWrapper(const ForeignTable* foreign_table);
138 
143  void iterativeFileScan(ChunkMetadataVector& chunk_metadata_vector,
144  IterativeFileScanParameters& file_scan_param);
145 
154  void populateChunks(std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map,
155  int fragment_id,
156  AbstractBuffer* delete_buffer);
157 
158  void populateChunkMapForColumns(const std::set<const ColumnDescriptor*>& columns,
159  const int fragment_id,
160  const ChunkToBufferMap& buffers,
161  std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map);
162 
163  void updateMetadata(std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map,
164  int fragment_id);
165 
167  const std::set<std::string>& rolled_off_files,
168  const std::map<int32_t, const ColumnDescriptor*>& column_by_id);
169 
170  std::map<ChunkKey, std::shared_ptr<ChunkMetadata>> chunk_metadata_map_;
171  std::map<int, FileRegions> fragment_id_to_file_regions_map_;
172 
173  std::unique_ptr<FileReader> file_reader_;
174 
175  const int db_id_;
177 
178  // Data needed for append workflow
179  std::map<ChunkKey, std::unique_ptr<ForeignStorageBuffer>> chunk_encoder_buffers_;
180  // How many rows have been read
181  size_t num_rows_;
182  // What byte offset we left off at in the file_reader
184  // Is this datawrapper restored from disk
186 
188 
189  // Force cache to be disabled
190  const bool disable_cache_;
191 
194 
195  // declared in three derived classes to avoid
196  // polluting ForeignDataWrapper virtual base
197  // @TODO refactor to lower class if needed
199 
200  // These parameters may be reused in a reentrant metadata scan
202  size_t buffer_size_;
204 
206 };
207 } // namespace foreign_storage
std::condition_variable & getChunkConditionalVariable(const int col_id) const
void updateRolledOffChunks(const std::set< std::string > &rolled_off_files, const std::map< int32_t, const ColumnDescriptor * > &column_by_id)
std::map< ChunkKey, AbstractBuffer * > ChunkToBufferMap
std::map< ChunkKey, std::unique_ptr< ForeignStorageBuffer > > chunk_encoder_buffers_
std::map< int, std::unique_ptr< std::mutex > > column_id_to_chunk_mutex
virtual const TextFileBufferParser & getFileBufferParser() const =0
std::map< ChunkKey, std::unique_ptr< ForeignStorageBuffer > > chunk_encoder_buffers
std::map< int, Chunk_NS::Chunk > & column_id_to_chunk_map
void iterativeFileScan(ChunkMetadataVector &chunk_metadata_vector, IterativeFileScanParameters &file_scan_param)
void createRenderGroupAnalyzers() override
Create RenderGroupAnalyzers for poly columns.
void populateChunkMetadata(ChunkMetadataVector &chunk_metadata_vector) override
MetadataScanMultiThreadingParams multi_threading_params_
std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > chunk_metadata_map_
void updateMetadata(std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map, int fragment_id)
virtual std::optional< size_t > getMaxFileCount() const
ParallelismLevel getNonCachedParallelismLevel() const override
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
An AbstractBuffer is a unit of data management for a data manager.
std::mutex & getChunkMutex(const int col_id) const
std::map< int, std::unique_ptr< std::condition_variable > > column_id_to_chunk_conditional_var
void populateChunks(std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map, int fragment_id, AbstractBuffer *delete_buffer)
ParallelismLevel getCachedParallelismLevel() const override
void populateChunkBuffers(const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers, AbstractBuffer *delete_buffer) override
void restoreDataWrapperInternals(const std::string &file_path, const ChunkMetadataVector &chunk_metadata) override
#define CHECK(condition)
Definition: Logger.h:222
IterativeFileScanParameters(std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map, int32_t fragment_id, AbstractBuffer *delete_buffer)
void populateChunkMapForColumns(const std::set< const ColumnDescriptor * > &columns, const int fragment_id, const ChunkToBufferMap &buffers, std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map)
std::map< int, std::unique_ptr< import_export::RenderGroupAnalyzer >> RenderGroupAnalyzerMap