OmniSciDB  85c2d10cdc
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ParquetFixedLengthEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "ParquetInPlaceEncoder.h"
20 
21 namespace foreign_storage {
22 
23 // ParquetFixedLengthEncoder is used in two separate use cases: metadata
24 // scanning & chunk loading. During metadata scan the type of metadata (& in
25 // some cases data) must be known, while during chunk loading only the type of
26 // data needs to be known.
27 //
28 // The following semantics apply to the templated types below.
29 //
30 // At metadata scan:
31 // V - type of metadata (for loading metadata)
32 // T - physical type of parquet data
33 //
34 // At chunk load:
35 // V - type of data (to load data)
36 // T - physical type of parquet data
37 template <typename V, typename T>
40  public:
42  const ColumnDescriptor* column_desciptor,
43  const parquet::ColumnDescriptor* parquet_column_descriptor)
44  : TypedParquetInPlaceEncoder<V, T>(buffer,
45  column_desciptor,
46  parquet_column_descriptor) {}
47 
49  const size_t omnisci_data_type_byte_size,
50  const size_t parquet_data_type_byte_size)
51  : TypedParquetInPlaceEncoder<V, T>(buffer,
52  omnisci_data_type_byte_size,
53  parquet_data_type_byte_size) {}
54 
55  void encodeAndCopy(const int8_t* parquet_data_bytes,
56  int8_t* omnisci_data_bytes) override {
57  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data_bytes)[0];
58  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
59  omnisci_data_value = parquet_data_value;
60  }
61 
62  void validate(std::shared_ptr<parquet::Statistics> stats,
63  const SQLTypeInfo& column_type) const override {
64  validateIntegralOrFloatingPointValue(stats, column_type);
65  }
66 
67  bool encodingIsIdentityForSameTypes() const override { return true; }
68 
69  private:
70  template <
71  typename TT = T,
72  std::enable_if_t<(!std::is_integral<TT>::value || std::is_same<TT, bool>::value) &&
73  !std::is_floating_point<TT>::value,
74  int> = 0>
75  void validateIntegralOrFloatingPointValue(std::shared_ptr<parquet::Statistics> stats,
76  const SQLTypeInfo& column_type) const {
77  // do nothing when type `T` is non-integral and non-floating-point (case
78  // for which this can happen are when `T` is bool)
79  }
80 
81  template <typename TT = T, std::enable_if_t<std::is_floating_point<TT>::value, int> = 0>
82  void validateIntegralOrFloatingPointValue(std::shared_ptr<parquet::Statistics> stats,
83  const SQLTypeInfo& column_type) const {
84  auto [unencoded_stats_min, unencoded_stats_max] =
86  if (column_type.is_fp()) {
87  FloatPointValidator<T>::validateValue(unencoded_stats_max, column_type);
88  FloatPointValidator<T>::validateValue(unencoded_stats_min, column_type);
89  } else {
90  UNREACHABLE();
91  }
92  }
93 
94  template <
95  typename TT = T,
96  std::enable_if_t<std::is_integral<TT>::value && !std::is_same<TT, bool>::value,
97  int> = 0>
98  void validateIntegralOrFloatingPointValue(std::shared_ptr<parquet::Statistics> stats,
99  const SQLTypeInfo& column_type) const {
100  if (!column_type.is_integer() && !column_type.is_timestamp()) {
101  return;
102  }
103  auto [unencoded_stats_min, unencoded_stats_max] =
105  if (column_type.is_integer()) {
107  column_type);
109  column_type);
110  } else if (column_type.is_timestamp()) {
111  TimestampBoundsValidator<T>::validateValue(unencoded_stats_max, column_type);
112  TimestampBoundsValidator<T>::validateValue(unencoded_stats_min, column_type);
113  }
114  }
115 };
116 
117 // ParquetUnsignedFixedLengthEncoder is used in two separate use cases:
118 // metadata scanning & chunk loading. During metadata scan the type of
119 // metadata (& in some cases data) must be known, while during chunk loading
120 // only the type of data needs to be known.
121 //
122 // The following semantics apply to the templated types below.
123 //
124 // At metadata scan:
125 // V - type of metadata (for loading metadata)
126 // T - physical type of parquet data
127 // U - unsigned type that the parquet data represents
128 //
129 // At chunk load:
130 // V - type of data (to load data)
131 // T - physical type of parquet data
132 // U - unsigned type that the parquet data represents
133 template <typename V, typename T, typename U>
135  public ParquetMetadataValidator {
136  public:
139  const ColumnDescriptor* column_desciptor,
140  const parquet::ColumnDescriptor* parquet_column_descriptor)
141  : TypedParquetInPlaceEncoder<V, T>(buffer,
142  column_desciptor,
143  parquet_column_descriptor) {}
144 
146  const size_t omnisci_data_type_byte_size,
147  const size_t parquet_data_type_byte_size)
148  : TypedParquetInPlaceEncoder<V, T>(buffer,
149  omnisci_data_type_byte_size,
150  parquet_data_type_byte_size) {}
151 
152  void encodeAndCopy(const int8_t* parquet_data_bytes,
153  int8_t* omnisci_data_bytes) override {
154  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data_bytes)[0];
155  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
156  omnisci_data_value = static_cast<U>(parquet_data_value);
157  }
158 
159  void validate(std::shared_ptr<parquet::Statistics> stats,
160  const SQLTypeInfo& column_type) const override {
161  if (!column_type.is_integer()) { // do not validate non-integral types
162  return;
163  }
164  auto [unencoded_stats_min, unencoded_stats_max] =
167  column_type);
169  column_type);
170  }
171 };
172 
173 } // namespace foreign_storage
ParquetUnsignedFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
bool is_timestamp() const
Definition: sqltypes.h:742
bool is_fp() const
Definition: sqltypes.h:492
ParquetFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
#define UNREACHABLE()
Definition: Logger.h:241
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
ParquetUnsignedFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
ParquetFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
bool is_integer() const
Definition: sqltypes.h:490
An AbstractBuffer is a unit of data management for a data manager.
specifies the content in-memory of a row in the column metadata table
void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const override
void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const override
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
std::pair< T, T > getUnencodedStats(std::shared_ptr< parquet::Statistics > stats) const
void validateIntegralOrFloatingPointValue(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)