OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetImporter.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <map>
20 #include <unordered_set>
21 #include <vector>
22 
24 #include "Catalog/CatalogFwd.h"
25 #include "ForeignDataWrapper.h"
26 #include "ForeignTableSchema.h"
28 #include "ImportExport/Importer.h"
29 #include "Interval.h"
30 #include "LazyParquetChunkLoader.h"
31 
32 namespace foreign_storage {
33 
35  public:
/// Virtual, defaulted destructor. Needed because instances are held through a
/// std::unique_ptr to this abstract type (see row_group_interval_tracker_), so
/// destruction happens via a base-class pointer.
36  virtual ~AbstractRowGroupIntervalTracker() = default;
/// Pure virtual: every concrete tracker must implement this.
/// @return an optional RowGroupInterval. Presumably std::nullopt signals that
///         no further intervals are available -- TODO confirm against the
///         implementations, which are not visible in this header.
37  virtual std::optional<RowGroupInterval> getNextRowGroupInterval() = 0;
38 };
39 
41  public:
43 
/// Constructs an importer from a database id, a foreign table and a user mapping.
/// @param db_id          database id; presumably stored in the const member
///                       db_id_ -- confirm in the .cpp.
/// @param foreign_table  foreign table pointer. NOTE(review): ownership and
///                       lifetime are not visible here; presumably non-owning.
/// @param user_mapping   user mapping pointer. NOTE(review): same ownership
///                       caveat as foreign_table; nullability is not documented.
44  ParquetImporter(const int db_id,
45  const ForeignTable* foreign_table,
46  const UserMapping* user_mapping);
47 
/// Overrides a virtual declared in a base class that is not visible in this listing.
/// @param chunk_metadata_vector  passed by non-const reference, so presumably
///        an output parameter that this call fills in -- TODO confirm. The type
///        is a vector of (ChunkKey, shared_ptr<ChunkMetadata>) pairs.
48  void populateChunkMetadata(ChunkMetadataVector& chunk_metadata_vector) override;
49 
/// Overrides a virtual declared in a base class that is not visible in this listing.
/// @param required_buffers  map from ChunkKey to AbstractBuffer*
///                          (ChunkToBufferMap is a std::map<ChunkKey, AbstractBuffer*>).
/// @param optional_buffers  same map type as required_buffers.
/// @param delete_buffer     raw AbstractBuffer pointer.
/// NOTE(review): the meaning of each argument, and whether delete_buffer may
/// be null, cannot be determined from this header -- confirm against the base
/// class documentation.
50  void populateChunkBuffers(const ChunkToBufferMap& required_buffers,
51  const ChunkToBufferMap& optional_buffers,
52  AbstractBuffer* delete_buffer) override;
53 
/// Const override returning a std::string by value.
/// NOTE(review): presumably a serialized representation of this wrapper's
/// state; the format is not visible here -- confirm in the .cpp.
54  std::string getSerializedDataWrapper() const override;
55 
57  const std::string& file_path,
58  const ChunkMetadataVector& chunk_metadata_vector) override;
59 
/// Const override returning a bool.
/// NOTE(review): presumably reports whether wrapper internals were restored
/// via restoreDataWrapperInternals() -- confirm in the .cpp.
60  bool isRestored() const override;
61 
63  UNREACHABLE();
64  return {};
65  }
66 
68  UNREACHABLE();
69  return {};
70  }
71 
/// Returns a uniquely-owned import_export::ImportBatchResult; the caller takes
/// ownership of the result.
/// NOTE(review): the original doc comment is missing from this listing.
/// Presumably each call produces the next batch of data to import; what is
/// returned once the input is exhausted is not visible here -- confirm.
78  std::unique_ptr<import_export::ImportBatchResult> getNextImportBatch();
79 
/// Returns, by value, a vector of (ColumnDescriptor*, StringDictionary*) pairs.
/// NOTE(review): the class has a member of the same type
/// (string_dictionaries_per_column_), which is presumably the source of the
/// returned data; pointer ownership is not visible here -- confirm.
85  std::vector<std::pair<const ColumnDescriptor*, StringDictionary*>>
86  getStringDictionaries() const;
87 
/// Const accessor returning an int.
/// NOTE(review): judging by the name, presumably an upper bound on the number
/// of threads that can do useful work for this import; how it is computed is
/// not visible here -- confirm.
91  int getMaxNumUsefulThreads() const;
92 
/// Takes a thread count by value.
/// NOTE(review): the effect (and the valid range of num_threads) is not
/// visible in this header -- confirm in the .cpp.
96  void setNumThreads(const int num_threads);
97 
98  private:
/// Const data member: must be initialized in the constructor's initializer
/// list (presumably from the db_id constructor argument -- confirm) and cannot
/// change afterwards. A const member also disables default copy/move assignment.
99  const int db_id_;
102 
/// Private helper returning file paths as a std::set<std::string>, i.e. the
/// result is de-duplicated and ordered.
/// NOTE(review): <set> is not among the #include lines visible in this
/// listing; presumably it arrives transitively -- consider including it directly.
103  std::set<std::string> getAllFilePaths();
104 
/// Exclusively-owned tracker, held through the abstract
/// AbstractRowGroupIntervalTracker interface so the concrete type can vary.
/// NOTE(review): a row_group_interval_tracker_mutex_ member also exists in this
/// class; presumably it guards access to this pointer -- confirm.
105  std::unique_ptr<AbstractRowGroupIntervalTracker> row_group_interval_tracker_;
106 
/// Exclusively-owned ForeignTableSchema.
107  std::unique_ptr<ForeignTableSchema> schema_;
/// Arrow file system handle with shared ownership (std::shared_ptr), so it may
/// be co-owned by other objects.
108  std::shared_ptr<arrow::fs::FileSystem> file_system_;
/// Exclusively-owned FileReaderMap. NOTE(review): presumably a cache of open
/// file readers, judging by the member name -- confirm.
109  std::unique_ptr<FileReaderMap> file_reader_cache_;
110  std::vector<std::pair<const ColumnDescriptor*, StringDictionary*>>
112 
115 };
116 } // namespace foreign_storage
std::set< std::string > getAllFilePaths()
std::shared_mutex row_group_interval_tracker_mutex_
std::shared_ptr< arrow::fs::FileSystem > file_system_
void setNumThreads(const int num_threads)
std::unique_ptr< AbstractRowGroupIntervalTracker > row_group_interval_tracker_
std::unique_ptr< ForeignTableSchema > schema_
void restoreDataWrapperInternals(const std::string &file_path, const ChunkMetadataVector &chunk_metadata_vector) override
#define UNREACHABLE()
Definition: Logger.h:338
std::map< ChunkKey, AbstractBuffer * > ChunkToBufferMap
std::shared_mutex string_dictionaries_per_column_mutex_
std::vector< std::pair< const ColumnDescriptor *, StringDictionary * > > getStringDictionaries() const
ParallelismLevel getCachedParallelismLevel() const override
void populateChunkBuffers(const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers, AbstractBuffer *delete_buffer) override
void populateChunkMetadata(ChunkMetadataVector &chunk_metadata_vector) override
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
An AbstractBuffer is a unit of data management for a data manager.
std::string getSerializedDataWrapper() const override
virtual std::optional< RowGroupInterval > getNextRowGroupInterval()=0
const ForeignTable * foreign_table_
std::unique_ptr< FileReaderMap > file_reader_cache_
ParallelismLevel getNonCachedParallelismLevel() const override
std::shared_timed_mutex shared_mutex
std::unique_ptr< import_export::ImportBatchResult > getNextImportBatch()
std::vector< std::pair< const ColumnDescriptor *, StringDictionary * > > string_dictionaries_per_column_