OmniSciDB  95562058bd
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ParquetDataWrapper.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <map>
20 #include <unordered_set>
21 #include <vector>
22 
23 #include "Catalog/Catalog.h"
24 #include "Catalog/ForeignTable.h"
25 #include "DataMgr/Chunk/Chunk.h"
26 #include "ForeignDataWrapper.h"
27 #include "ForeignTableSchema.h"
28 #include "ImportExport/Importer.h"
29 #include "Interval.h"
30 #include "LazyParquetChunkLoader.h"
31 
32 namespace foreign_storage {
34  public:
35  ParquetDataWrapper(const int db_id, const ForeignTable* foreign_table);
36 
37  void populateChunkMetadata(ChunkMetadataVector& chunk_metadata_vector) override;
38 
39  void populateChunkBuffers(
40  std::map<ChunkKey, AbstractBuffer*>& required_buffers,
41  std::map<ChunkKey, AbstractBuffer*>& optional_buffers) override;
42 
43  static void validateOptions(const ForeignTable* foreign_table);
44 
45  static std::vector<std::string_view> getSupportedOptions();
46 
47  void serializeDataWrapperInternals(const std::string& file_path) const override;
48 
49  void restoreDataWrapperInternals(
50  const std::string& file_path,
51  const ChunkMetadataVector& chunk_metadata_vector) override;
52 
53  bool isRestored() const override;
54 
55  private:
56  ParquetDataWrapper(const ForeignTable* foreign_table);
57 
58  std::list<const ColumnDescriptor*> getColumnsToInitialize(
59  const Interval<ColumnType>& column_interval);
60  void initializeChunkBuffers(const int fragment_index,
61  const Interval<ColumnType>& column_interval,
62  std::map<ChunkKey, AbstractBuffer*>& required_buffers,
63  const bool reserve_buffers_and_set_stats = false);
64  void fetchChunkMetadata();
65  void loadBuffersUsingLazyParquetChunkLoader(
66  const int logical_column_id,
67  const int fragment_id,
68  std::map<ChunkKey, AbstractBuffer*>& required_buffers);
69 
70  void validateFilePath() const;
71  std::string getConfiguredFilePath() const;
72  std::set<std::string> getProcessedFilePaths();
73  std::set<std::string> getAllFilePaths();
74 
75  import_export::CopyParams validateAndGetCopyParams() const;
76 
86  std::string validateAndGetStringWithLength(const std::string& option_name,
87  const size_t expected_num_chars) const;
88 
89  bool moveToNextFragment(size_t new_rows_count) const;
90 
91  void finalizeFragmentMap();
92  void addNewFragment(int row_group, const std::string& file_path);
93 
94  bool isNewFile(const std::string& file_path) const;
95 
96  void addNewFile(const std::string& file_path);
97 
98  void resetParquetMetadata();
99 
100  void metadataScanFiles(const std::set<std::string>& file_paths);
101 
102  std::map<int, std::vector<RowGroupInterval>> fragment_to_row_group_interval_map_;
103  std::map<ChunkKey, std::shared_ptr<ChunkMetadata>> chunk_metadata_map_;
104  const int db_id_;
111  std::unique_ptr<ForeignTableSchema> schema_;
112  std::shared_ptr<arrow::fs::FileSystem> file_system_;
113 
114  static constexpr std::array<char const*, 2> supported_options_{"BASE_PATH",
115  "FILE_PATH"};
116 };
117 } // namespace foreign_storage
void loadBuffersUsingLazyParquetChunkLoader(const int logical_column_id, const int fragment_id, std::map< ChunkKey, AbstractBuffer * > &required_buffers)
void populateChunkBuffers(std::map< ChunkKey, AbstractBuffer * > &required_buffers, std::map< ChunkKey, AbstractBuffer * > &optional_buffers) override
std::set< std::string > getProcessedFilePaths()
std::set< std::string > getAllFilePaths()
void restoreDataWrapperInternals(const std::string &file_path, const ChunkMetadataVector &chunk_metadata_vector) override
static std::vector< std::string_view > getSupportedOptions()
std::unique_ptr< ForeignTableSchema > schema_
void serializeDataWrapperInternals(const std::string &file_path) const override
ParquetDataWrapper(const int db_id, const ForeignTable *foreign_table)
import_export::CopyParams validateAndGetCopyParams() const
std::map< int, std::vector< RowGroupInterval > > fragment_to_row_group_interval_map_
This file contains the class specification and related data structures for Catalog.
std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > chunk_metadata_map_
void addNewFile(const std::string &file_path)
void metadataScanFiles(const std::set< std::string > &file_paths)
void addNewFragment(int row_group, const std::string &file_path)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
bool isNewFile(const std::string &file_path) const
bool moveToNextFragment(size_t new_rows_count) const
std::shared_ptr< arrow::fs::FileSystem > file_system_
std::list< const ColumnDescriptor * > getColumnsToInitialize(const Interval< ColumnType > &column_interval)
void populateChunkMetadata(ChunkMetadataVector &chunk_metadata_vector) override
void initializeChunkBuffers(const int fragment_index, const Interval< ColumnType > &column_interval, std::map< ChunkKey, AbstractBuffer * > &required_buffers, const bool reserve_buffers_and_set_stats=false)
static constexpr std::array< char const *, 2 > supported_options_
static void validateOptions(const ForeignTable *foreign_table)
std::string validateAndGetStringWithLength(const std::string &option_name, const size_t expected_num_chars) const