OmniSciDB  85c2d10cdc
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ParquetShared.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "ParquetShared.h"
18 
19 #include <parquet/column_scanner.h>
20 #include <parquet/exception.h>
21 #include <parquet/platform.h>
22 
23 namespace foreign_storage {
24 
25 UniqueReaderPtr open_parquet_table(const std::string& file_path,
26  std::shared_ptr<arrow::fs::FileSystem>& file_system) {
27  UniqueReaderPtr reader;
28  auto file_result = file_system->OpenInputFile(file_path);
29  if (!file_result.ok()) {
30  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
31  file_path + ". " + file_result.status().message()};
32  }
33  auto infile = file_result.ValueOrDie();
34  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
35  return reader;
36 }
37 
38 std::pair<int, int> get_parquet_table_size(const ReaderPtr& reader) {
39  auto file_metadata = reader->parquet_reader()->metadata();
40  const auto num_row_groups = file_metadata->num_row_groups();
41  const auto num_columns = file_metadata->num_columns();
42  return std::make_pair(num_row_groups, num_columns);
43 }
44 
45 const parquet::ColumnDescriptor* get_column_descriptor(
46  const parquet::arrow::FileReader* reader,
47  const int logical_column_index) {
48  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
49 }
50 
51 parquet::Type::type get_physical_type(ReaderPtr& reader, const int logical_column_index) {
52  return reader->parquet_reader()
53  ->metadata()
54  ->schema()
55  ->Column(logical_column_index)
56  ->physical_type();
57 }
58 
60  const parquet::ColumnDescriptor* reference_descriptor,
61  const parquet::ColumnDescriptor* new_descriptor,
62  const std::string& reference_file_path,
63  const std::string& new_file_path) {
64  if (!reference_descriptor->Equals(*new_descriptor)) {
65  throw std::runtime_error{"Parquet file \"" + new_file_path +
66  "\" has a different schema. Please ensure that all Parquet "
67  "files use the same schema. Reference Parquet file: " +
68  reference_file_path +
69  ", column name: " + reference_descriptor->name() +
70  ". New Parquet file: " + new_file_path +
71  ", column name: " + new_descriptor->name() + "."};
72  }
73 }
74 
75 std::unique_ptr<ColumnDescriptor> get_sub_type_column_descriptor(
76  const ColumnDescriptor* column) {
77  auto column_type = column->columnType.get_elem_type();
78  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
79  column_type.set_size(4); // override default size of -1
80  }
81  return std::make_unique<ColumnDescriptor>(
82  column->tableId, column->columnId, column->columnName, column_type);
83 }
84 
85 std::shared_ptr<parquet::Statistics> validate_and_get_column_metadata_statistics(
86  const parquet::ColumnChunkMetaData* column_metadata) {
87  CHECK(column_metadata->is_stats_set());
88  std::shared_ptr<parquet::Statistics> stats = column_metadata->statistics();
89  bool is_all_nulls = stats->null_count() == column_metadata->num_values();
90  CHECK(is_all_nulls || stats->HasMinMax());
91  return stats;
92 }
93 
94 } // namespace foreign_storage
void set_size(int s)
Definition: sqltypes.h:412
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
std::pair< int, int > get_parquet_table_size(const ReaderPtr &reader)
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
parquet::Type::type get_physical_type(ReaderPtr &reader, const int logical_column_index)
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
UniqueReaderPtr open_parquet_table(const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
const parquet::ColumnDescriptor * get_column_descriptor(const parquet::arrow::FileReader *reader, const int logical_column_index)
Specifies the in-memory content of a row in the column metadata table.
parquet::arrow::FileReader * ReaderPtr
Definition: ParquetShared.h:33
#define CHECK(condition)
Definition: Logger.h:197
SQLTypeInfo columnType
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
Definition: ParquetShared.h:32
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:712
std::string columnName