OmniSciDB  6686921089
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetFixedLengthEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "ParquetInPlaceEncoder.h"
20 
21 namespace foreign_storage {
22 
23 // ParquetFixedLengthEncoder is used in two separate use cases: metadata
24 // scanning & chunk loading. During metadata scan the type of metadata (& in
25 // some cases data) must be known, while during chunk loading only the type of
26 // data needs to be known.
27 //
28 // The following semantics apply to the templated types below.
29 //
30 // At metadata scan:
31 // V - type of metadata (for loading metadata)
32 // T - physical type of parquet data
33 //
34 // At chunk load:
35 // V - type of data (to load data)
36 // T - physical type of parquet data
37 // NullType - the type to use for encoding nulls
38 template <typename V, typename T, typename NullType = V>
41  public:
// Remainder of the ParquetFixedLengthEncoder constructor that takes a buffer
// plus the two column descriptors.
// NOTE(review): the opening line of this definition (constructor name and its
// leading `buffer` parameter) is absent from this listing - confirm against
// the original header.
// The initializer list forwards `buffer`, `column_desciptor` and
// `parquet_column_descriptor` unchanged to the
// TypedParquetInPlaceEncoder<V, T, NullType> base; the constructor body is
// empty.
43  const ColumnDescriptor* column_desciptor,
44  const parquet::ColumnDescriptor* parquet_column_descriptor)
45  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
46  column_desciptor,
47  parquet_column_descriptor) {}
48 
// Remainder of the ParquetFixedLengthEncoder constructor that takes a buffer
// plus explicit byte sizes instead of column descriptors.
// NOTE(review): the opening line of this definition (constructor name and its
// leading `buffer` parameter) is absent from this listing - confirm against
// the original header.
// The initializer list forwards `buffer`, `omnisci_data_type_byte_size` and
// `parquet_data_type_byte_size` unchanged to the
// TypedParquetInPlaceEncoder<V, T, NullType> base; the constructor body is
// empty.
50  const size_t omnisci_data_type_byte_size,
51  const size_t parquet_data_type_byte_size)
52  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
53  omnisci_data_type_byte_size,
54  parquet_data_type_byte_size) {}
55 
56  void encodeAndCopy(const int8_t* parquet_data_bytes,
57  int8_t* omnisci_data_bytes) override {
58  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data_bytes)[0];
59  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
60  omnisci_data_value = parquet_data_value;
61  }
62 
63  void validate(std::shared_ptr<parquet::Statistics> stats,
64  const SQLTypeInfo& column_type) const override {
65  validateIntegralOrFloatingPointMetadata(stats, column_type);
66  }
67 
68  void validate(const int8_t* parquet_data,
69  const int64_t j,
70  const SQLTypeInfo& column_type) const override {
71  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data)[j];
72  validateIntegralOrFloatingPointValue(parquet_data_value, column_type);
73  }
74 
75  bool encodingIsIdentityForSameTypes() const override { return true; }
76 
77  private:
// No-op overload of the per-value validator.
// The enable_if condition selects this overload when `TT` is not a
// floating-point type and is either non-integral or exactly `bool`.
// NOTE(review): the line carrying the function name and its first parameter
// is absent from this listing; the callers in this class invoke it as
// validateIntegralOrFloatingPointValue(value, column_type) - confirm against
// the original header.
78  template <
79  typename TT = T,
80  std::enable_if_t<(!std::is_integral<TT>::value || std::is_same<TT, bool>::value) &&
81  !std::is_floating_point<TT>::value,
82  int> = 0>
84  const SQLTypeInfo& column_type) const {
85  // nothing to validate when `T` is neither a non-bool integral type nor a
86  // floating-point type (for example, when `T` is bool)
87  }
88 
// Floating-point overload of the per-value validator; enabled only when `TT`
// is a floating-point type.
// For a floating-point column the value is checked with
// FloatPointValidator<T>::validateValue(); any other column type is treated
// as a programming error via UNREACHABLE().
// NOTE(review): the line carrying the function name and the `value` parameter
// is absent from this listing - confirm against the original header.
89  template <typename TT = T, std::enable_if_t<std::is_floating_point<TT>::value, int> = 0>
91  const SQLTypeInfo& column_type) const {
92  if (column_type.is_fp()) {
93  FloatPointValidator<T>::validateValue(value, column_type);
94  } else {
95  UNREACHABLE();
96  }
97  }
98 
// Integral overload of the per-value validator; enabled only when `TT` is an
// integral type other than `bool`.
// Timestamp columns are checked with TimestampBoundsValidator<T>::validateValue().
// Column types that are neither integer nor timestamp pass through without
// any check (there is no trailing else branch).
// NOTE(review): the line carrying the function name and the `value` parameter
// is absent from this listing - confirm against the original header.
99  template <
100  typename TT = T,
101  std::enable_if_t<std::is_integral<TT>::value && !std::is_same<TT, bool>::value,
102  int> = 0>
104  const SQLTypeInfo& column_type) const {
105  if (column_type.is_integer()) {
// NOTE(review): the statement executed for integer columns is absent from
// this listing - restore it from the original header before relying on this
// branch.
107  } else if (column_type.is_timestamp()) {
108  TimestampBoundsValidator<T>::validateValue(value, column_type);
109  }
110  }
111 
// Validates the minimum and maximum of a statistics object against
// `column_type`.
// Returns immediately, without checking anything, unless the column is an
// integer, timestamp or floating-point column. Otherwise both extremes are
// run through validateIntegralOrFloatingPointValue().
112  void validateIntegralOrFloatingPointMetadata(std::shared_ptr<parquet::Statistics> stats,
113  const SQLTypeInfo& column_type) const {
114  if (!column_type.is_integer() && !column_type.is_timestamp() &&
115  !column_type.is_fp()) {
116  return;
117  }
118  auto [unencoded_stats_min, unencoded_stats_max] =
// NOTE(review): the initializer of the structured binding above is absent
// from this listing; presumably it is a call to getUnencodedStats(stats) -
// confirm against the original header.
120  validateIntegralOrFloatingPointValue(unencoded_stats_min, column_type);
121  validateIntegralOrFloatingPointValue(unencoded_stats_max, column_type);
122  }
123 };
124 
125 // ParquetUnsignedFixedLengthEncoder is used in two separate use cases:
126 // metadata scanning & chunk loading. During metadata scan the type of
127 // metadata (& in some cases data) must be known, while during chunk loading
128 // only the type of data needs to be known.
129 //
130 // The following semantics apply to the templated types below.
131 //
132 // At metadata scan:
133 // V - type of metadata (for loading metadata)
134 // T - physical type of parquet data
135 // U - unsigned type that the parquet data represents
136 //
137 // At chunk load:
138 // V - type of data (to load data)
139 // T - physical type of parquet data
140 // U - unsigned type that the parquet data represents
141 // NullType - the type to use for encoding nulls
142 template <typename V, typename T, typename U, typename NullType = V>
144  : public TypedParquetInPlaceEncoder<V, T, NullType>,
145  public ParquetMetadataValidator {
146  public:
// Remainder of the ParquetUnsignedFixedLengthEncoder constructor that takes a
// buffer plus the two column descriptors.
// NOTE(review): the opening line of this definition (constructor name and its
// leading `buffer` parameter) is absent from this listing - confirm against
// the original header.
// The initializer list forwards `buffer`, `column_desciptor` and
// `parquet_column_descriptor` unchanged to the
// TypedParquetInPlaceEncoder<V, T, NullType> base; the constructor body is
// empty.
149  const ColumnDescriptor* column_desciptor,
150  const parquet::ColumnDescriptor* parquet_column_descriptor)
151  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
152  column_desciptor,
153  parquet_column_descriptor) {}
154 
// Remainder of the ParquetUnsignedFixedLengthEncoder constructor that takes a
// buffer plus explicit byte sizes instead of column descriptors.
// NOTE(review): the opening line of this definition (constructor name and its
// leading `buffer` parameter) is absent from this listing - confirm against
// the original header.
// The initializer list forwards `buffer`, `omnisci_data_type_byte_size` and
// `parquet_data_type_byte_size` unchanged to the
// TypedParquetInPlaceEncoder<V, T, NullType> base; the constructor body is
// empty.
156  const size_t omnisci_data_type_byte_size,
157  const size_t parquet_data_type_byte_size)
158  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
159  omnisci_data_type_byte_size,
160  parquet_data_type_byte_size) {}
161 
162  void encodeAndCopy(const int8_t* parquet_data_bytes,
163  int8_t* omnisci_data_bytes) override {
164  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data_bytes)[0];
165  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
166  omnisci_data_value = static_cast<U>(parquet_data_value);
167  }
168 
// Statistics-based validation for the unsigned encoder.
// Returns immediately for any column type that is not an integer type; for
// integer columns the minimum and maximum obtained from `stats` are checked
// against `column_type`.
169  void validate(std::shared_ptr<parquet::Statistics> stats,
170  const SQLTypeInfo& column_type) const override {
171  if (!column_type.is_integer()) { // do not validate non-integral types
172  return;
173  }
174  auto [unencoded_stats_min, unencoded_stats_max] =
// NOTE(review): three lines are absent from this listing here - the
// initializer of the structured binding above and the opening lines of the
// two calls whose trailing `column_type);` arguments follow. Presumably the
// calls are IntegralFixedLengthBoundsValidator<U>::validateValue() on each
// extreme, mirroring the per-value overload - confirm against the original
// header.
177  column_type);
179  column_type);
180  }
181 
182  void validate(const int8_t* parquet_data,
183  const int64_t j,
184  const SQLTypeInfo& column_type) const override {
185  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data)[j];
186  IntegralFixedLengthBoundsValidator<U>::validateValue(parquet_data_value, column_type);
187  }
188 };
189 
190 } // namespace foreign_storage
void validateIntegralOrFloatingPointValue(const T &value, const SQLTypeInfo &column_type) const
bool is_timestamp() const
Definition: sqltypes.h:880
bool is_fp() const
Definition: sqltypes.h:513
#define UNREACHABLE()
Definition: Logger.h:253
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
void validateIntegralOrFloatingPointMetadata(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const override
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
ParquetFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
bool is_integer() const
Definition: sqltypes.h:511
ParquetUnsignedFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
An AbstractBuffer is a unit of data management for a data manager.
specifies the content in-memory of a row in the column metadata table
std::pair< T, T > getUnencodedStats(std::shared_ptr< parquet::Statistics > stats) const
void validate(const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
ParquetFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const override
void validate(const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
ParquetUnsignedFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)