OmniSciDB  2e3a973ef4
ParquetShared.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "ParquetShared.h"
18 
19 #include <parquet/column_scanner.h>
20 #include <parquet/exception.h>
21 #include <parquet/platform.h>
22 
23 namespace foreign_storage {
24 
25 void open_parquet_table(const std::string& file_path,
26  std::unique_ptr<parquet::arrow::FileReader>& reader,
27  std::shared_ptr<arrow::fs::FileSystem>& file_system) {
28  auto file_result = file_system->OpenInputFile(file_path);
29  if (!file_result.ok()) {
30  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
31  file_path + ". " + file_result.status().message()};
32  }
33  auto infile = file_result.ValueOrDie();
34  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
35 }
36 
37 std::pair<int, int> get_parquet_table_size(
38  const std::unique_ptr<parquet::arrow::FileReader>& reader) {
39  auto file_metadata = reader->parquet_reader()->metadata();
40  const auto num_row_groups = file_metadata->num_row_groups();
41  const auto num_columns = file_metadata->num_columns();
42  return std::make_pair(num_row_groups, num_columns);
43 }
44 
45 const parquet::ColumnDescriptor* get_column_descriptor(
46  const parquet::arrow::FileReader* reader,
47  const int logical_column_index) {
48  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
49 }
50 
51 parquet::Type::type get_physical_type(std::unique_ptr<parquet::arrow::FileReader>& reader,
52  const int logical_column_index) {
53  return reader->parquet_reader()
54  ->metadata()
55  ->schema()
56  ->Column(logical_column_index)
57  ->physical_type();
58 }
59 
61  const parquet::ColumnDescriptor* reference_descriptor,
62  const parquet::ColumnDescriptor* new_descriptor,
63  const std::string& reference_file_path,
64  const std::string& new_file_path) {
65  if (!reference_descriptor->Equals(*new_descriptor)) {
66  throw std::runtime_error{"Parquet file \"" + new_file_path +
67  "\" has a different schema. Please ensure that all Parquet "
68  "files use the same schema. Reference Parquet file: " +
69  reference_file_path +
70  ", column name: " + reference_descriptor->name() +
71  ". New Parquet file: " + new_file_path +
72  ", column name: " + new_descriptor->name() + "."};
73  }
74 }
75 
76 std::unique_ptr<ColumnDescriptor> get_sub_type_column_descriptor(
77  const ColumnDescriptor* column) {
78  auto column_type = column->columnType.get_elem_type();
79  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
80  column_type.set_size(4); // override default size of -1
81  }
82  return std::make_unique<ColumnDescriptor>(
83  column->tableId, column->columnId, column->columnName, column_type);
84 }
85 
86 } // namespace foreign_storage
void set_size(int s)
Definition: sqltypes.h:357
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
std::pair< int, int > get_parquet_table_size(const std::unique_ptr< parquet::arrow::FileReader > &reader)
const parquet::ColumnDescriptor * get_column_descriptor(const parquet::arrow::FileReader *reader, const int logical_column_index)
Specifies the in-memory content of a row in the column metadata table.
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:624
void open_parquet_table(const std::string &file_path, std::unique_ptr< parquet::arrow::FileReader > &reader, std::shared_ptr< arrow::fs::FileSystem > &file_system)
SQLTypeInfo columnType
parquet::Type::type get_physical_type(std::unique_ptr< parquet::arrow::FileReader > &reader, const int logical_column_index)
std::string columnName