OmniSciDB  0264ff685a
ParquetDataWrapper.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <map>
20 #include <unordered_set>
21 #include <vector>
22 
23 #include "Catalog/Catalog.h"
24 #include "Catalog/ForeignTable.h"
25 #include "DataMgr/Chunk/Chunk.h"
26 #include "ForeignDataWrapper.h"
27 #include "ForeignTableSchema.h"
28 #include "ImportExport/Importer.h"
29 #include "Interval.h"
30 #include "LazyParquetChunkLoader.h"
31 
32 namespace foreign_storage {
33 class ParquetDataWrapper : public ForeignDataWrapper {
34  public:
35  ParquetDataWrapper(const int db_id, const ForeignTable* foreign_table);
36 
37  void populateChunkMetadata(ChunkMetadataVector& chunk_metadata_vector) override;
38 
39  void populateChunkBuffers(
40  std::map<ChunkKey, AbstractBuffer*>& required_buffers,
41  std::map<ChunkKey, AbstractBuffer*>& optional_buffers) override;
42 
43  static void validateOptions(const ForeignTable* foreign_table);
44 
45  static std::vector<std::string_view> getSupportedOptions();
46 
47  void serializeDataWrapperInternals(const std::string& file_path) const override;
48 
49  void restoreDataWrapperInternals(
50  const std::string& file_path,
51  const ChunkMetadataVector& chunk_metadata_vector) override;
52 
53  bool isRestored() const override;
54 
55  private:
56  ParquetDataWrapper(const ForeignTable* foreign_table);
57 
58  std::list<const ColumnDescriptor*> getColumnsToInitialize(
59  const Interval<ColumnType>& column_interval);
60  void initializeChunkBuffers(const int fragment_index,
61  const Interval<ColumnType>& column_interval,
62  std::map<ChunkKey, AbstractBuffer*>& required_buffers,
63  const bool reserve_buffers_and_set_stats = false);
64  void fetchChunkMetadata();
65  void loadBuffersUsingLazyParquetChunkLoader(
66  const int logical_column_id,
67  const int fragment_id,
68  std::map<ChunkKey, AbstractBuffer*>& required_buffers);
69 
70  void validateFilePath() const;
71  std::set<std::string> getProcessedFilePaths();
72  std::set<std::string> getAllFilePaths();
73 
74  import_export::CopyParams validateAndGetCopyParams() const;
75 
85  std::string validateAndGetStringWithLength(const std::string& option_name,
86  const size_t expected_num_chars) const;
87 
88  bool moveToNextFragment(size_t new_rows_count) const;
89 
90  void finalizeFragmentMap();
91  void addNewFragment(int row_group, const std::string& file_path);
92 
93  bool isNewFile(const std::string& file_path) const;
94 
95  void addNewFile(const std::string& file_path);
96 
97  void resetParquetMetadata();
98 
99  void metadataScanFiles(const std::set<std::string>& file_paths);
100 
101  std::map<int, std::vector<RowGroupInterval>> fragment_to_row_group_interval_map_;
102  std::map<ChunkKey, std::shared_ptr<ChunkMetadata>> chunk_metadata_map_;
103  const int db_id_;
110  std::unique_ptr<ForeignTableSchema> schema_;
111  std::shared_ptr<arrow::fs::FileSystem> file_system_;
112 
113  static constexpr std::array<char const*, 0> supported_options_{};
114 };
115 } // namespace foreign_storage
std::set< std::string > getProcessedFilePaths()
std::set< std::string > getAllFilePaths()
void restoreDataWrapperInternals(const std::string &file_path, const ChunkMetadataVector &chunk_metadata_vector) override
static std::vector< std::string_view > getSupportedOptions()
std::unique_ptr< ForeignTableSchema > schema_
bool isNewFile(const std::string &file_path) const
void serializeDataWrapperInternals(const std::string &file_path) const override
ParquetDataWrapper(const int db_id, const ForeignTable *foreign_table)
std::map< int, std::vector< RowGroupInterval > > fragment_to_row_group_interval_map_
This file contains the class specification and related data structures for Catalog.
std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > chunk_metadata_map_
void addNewFile(const std::string &file_path)
static constexpr std::array< char const *, 0 > supported_options_
void metadataScanFiles(const std::set< std::string > &file_paths)
void addNewFragment(int row_group, const std::string &file_path)
std::string validateAndGetStringWithLength(const std::string &option_name, const size_t expected_num_chars) const
void initializeChunkBuffers(const int fragment_index, const Interval< ColumnType > &column_interval, std::map< ChunkKey, AbstractBuffer *> &required_buffers, const bool reserve_buffers_and_set_stats=false)
std::shared_ptr< arrow::fs::FileSystem > file_system_
std::list< const ColumnDescriptor * > getColumnsToInitialize(const Interval< ColumnType > &column_interval)
void populateChunkMetadata(ChunkMetadataVector &chunk_metadata_vector) override
bool moveToNextFragment(size_t new_rows_count) const
import_export::CopyParams validateAndGetCopyParams() const
static void validateOptions(const ForeignTable *foreign_table)
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata > >> ChunkMetadataVector
void loadBuffersUsingLazyParquetChunkLoader(const int logical_column_id, const int fragment_id, std::map< ChunkKey, AbstractBuffer *> &required_buffers)
void populateChunkBuffers(std::map< ChunkKey, AbstractBuffer *> &required_buffers, std::map< ChunkKey, AbstractBuffer *> &optional_buffers) override