OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetShared.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "ParquetShared.h"
18 
19 #include <parquet/column_scanner.h>
20 #include <parquet/exception.h>
21 #include <parquet/platform.h>
23 
24 namespace foreign_storage {
25 
26 UniqueReaderPtr open_parquet_table(const std::string& file_path,
27  std::shared_ptr<arrow::fs::FileSystem>& file_system) {
28  UniqueReaderPtr reader;
29  auto file_result = file_system->OpenInputFile(file_path);
30  if (!file_result.ok()) {
31  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
32  file_path + ". " + file_result.status().message()};
33  }
34  auto infile = file_result.ValueOrDie();
35  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
36  return reader;
37 }
38 
39 std::pair<int, int> get_parquet_table_size(const ReaderPtr& reader) {
40  auto file_metadata = reader->parquet_reader()->metadata();
41  const auto num_row_groups = file_metadata->num_row_groups();
42  const auto num_columns = file_metadata->num_columns();
43  return std::make_pair(num_row_groups, num_columns);
44 }
45 
46 const parquet::ColumnDescriptor* get_column_descriptor(
47  const parquet::arrow::FileReader* reader,
48  const int logical_column_index) {
49  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
50 }
51 
52 parquet::Type::type get_physical_type(ReaderPtr& reader, const int logical_column_index) {
53  return reader->parquet_reader()
54  ->metadata()
55  ->schema()
56  ->Column(logical_column_index)
57  ->physical_type();
58 }
59 
61  const parquet::ColumnDescriptor* reference_descriptor,
62  const parquet::ColumnDescriptor* new_descriptor,
63  const std::string& reference_file_path,
64  const std::string& new_file_path) {
65  if (!reference_descriptor->Equals(*new_descriptor)) {
66  throw std::runtime_error{"Parquet file \"" + new_file_path +
67  "\" has a different schema. Please ensure that all Parquet "
68  "files use the same schema. Reference Parquet file: " +
69  reference_file_path +
70  ", column name: " + reference_descriptor->name() +
71  ". New Parquet file: " + new_file_path +
72  ", column name: " + new_descriptor->name() + "."};
73  }
74 }
75 
76 std::unique_ptr<ColumnDescriptor> get_sub_type_column_descriptor(
77  const ColumnDescriptor* column) {
78  auto column_type = column->columnType.get_elem_type();
79  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
80  column_type.set_size(4); // override default size of -1
81  }
82  return std::make_unique<ColumnDescriptor>(
83  column->tableId, column->columnId, column->columnName, column_type, column->db_id);
84 }
85 
86 std::shared_ptr<parquet::Statistics> validate_and_get_column_metadata_statistics(
87  const parquet::ColumnChunkMetaData* column_metadata) {
88  CHECK(column_metadata->is_stats_set());
89  std::shared_ptr<parquet::Statistics> stats = column_metadata->statistics();
90  return stats;
91 }
92 
93 } // namespace foreign_storage
void set_size(int s)
Definition: sqltypes.h:478
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
std::pair< int, int > get_parquet_table_size(const ReaderPtr &reader)
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
parquet::Type::type get_physical_type(ReaderPtr &reader, const int logical_column_index)
dictionary stats
Definition: report.py:116
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
UniqueReaderPtr open_parquet_table(const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
const parquet::ColumnDescriptor * get_column_descriptor(const parquet::arrow::FileReader *reader, const int logical_column_index)
specifies the content in-memory of a row in the column metadata table
parquet::arrow::FileReader * ReaderPtr
Definition: ParquetShared.h:33
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
Definition: ParquetShared.h:32
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:977
std::string columnName