OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetShared.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
#include <mutex>

#include <arrow/api.h>
#include <arrow/filesystem/filesystem.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/statistics.h>
#include <parquet/types.h>

#include "Catalog/CatalogFwd.h"
#include "DataMgr/ChunkMetadata.h"
29 
30 namespace foreign_storage {
31 
32 using UniqueReaderPtr = std::unique_ptr<parquet::arrow::FileReader>;
33 using ReaderPtr = parquet::arrow::FileReader*;
34 
36  std::string file_path;
37  int start_index{-1}, end_index{-1};
38 };
39 
41  std::string file_path;
43  std::list<std::shared_ptr<ChunkMetadata>> column_chunk_metadata;
44 };
45 
46 UniqueReaderPtr open_parquet_table(const std::string& file_path,
47  std::shared_ptr<arrow::fs::FileSystem>& file_system);
48 
49 std::pair<int, int> get_parquet_table_size(const ReaderPtr& reader);
50 
51 const parquet::ColumnDescriptor* get_column_descriptor(
52  const parquet::arrow::FileReader* reader,
53  const int logical_column_index);
54 
56  const parquet::ColumnDescriptor* reference_descriptor,
57  const parquet::ColumnDescriptor* new_descriptor,
58  const std::string& reference_file_path,
59  const std::string& new_file_path);
60 
61 std::unique_ptr<ColumnDescriptor> get_sub_type_column_descriptor(
62  const ColumnDescriptor* column);
63 
64 std::shared_ptr<parquet::Statistics> validate_and_get_column_metadata_statistics(
65  const parquet::ColumnChunkMetaData* column_metadata);
66 
67 // A cache for parquet FileReaders which locks access for parallel use.
69  public:
70  const ReaderPtr getOrInsert(const std::string& path,
71  std::shared_ptr<arrow::fs::FileSystem>& file_system) {
73  if (map_.count(path) < 1 || !(map_.at(path))) {
74  map_[path] = open_parquet_table(path, file_system);
75  }
76  return map_.at(path).get();
77  }
78 
79  const ReaderPtr insert(const std::string& path,
80  std::shared_ptr<arrow::fs::FileSystem>& file_system) {
82  map_[path] = open_parquet_table(path, file_system);
83  return map_.at(path).get();
84  }
85 
86  void initializeIfEmpty(const std::string& path) {
88  if (map_.count(path) < 1) {
89  map_.emplace(path, UniqueReaderPtr());
90  }
91  }
92 
93  void clear() {
95  map_.clear();
96  }
97 
98  private:
99  mutable std::mutex mutex_;
100  std::map<const std::string, UniqueReaderPtr> map_;
101 };
102 } // namespace foreign_storage
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
std::pair< int, int > get_parquet_table_size(const ReaderPtr &reader)
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
UniqueReaderPtr open_parquet_table(const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
const parquet::ColumnDescriptor * get_column_descriptor(const parquet::arrow::FileReader *reader, const int logical_column_index)
std::list< std::shared_ptr< ChunkMetadata > > column_chunk_metadata
Definition: ParquetShared.h:43
const ReaderPtr getOrInsert(const std::string &path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
Definition: ParquetShared.h:70
std::unique_lock< T > unique_lock
Specifies the in-memory content of a row in the column metadata table.
parquet::arrow::FileReader * ReaderPtr
Definition: ParquetShared.h:33
const ReaderPtr insert(const std::string &path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
Definition: ParquetShared.h:79
void initializeIfEmpty(const std::string &path)
Definition: ParquetShared.h:86
std::map< const std::string, UniqueReaderPtr > map_
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
Definition: ParquetShared.h:32