OmniSciDB  471d68cefb
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetMetadataValidator.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "ParquetEncoder.h"
21 
22 namespace foreign_storage {
24  public:
25  virtual ~ParquetMetadataValidator() = default;
26 
27  virtual void validate(std::shared_ptr<parquet::Statistics> stats,
28  const SQLTypeInfo& column_type) const = 0;
29 };
30 
31 template <typename D, typename T>
32 inline bool check_bounds(const T& value) {
33  auto [min_value, max_value] = get_min_max_bounds<D>();
34  return value >= min_value && value <= max_value;
35 }
36 
37 template <typename D>
38 inline std::string datetime_to_string(const D& timestamp,
39  const SQLTypeInfo& column_type) {
40  CHECK(column_type.is_timestamp() || column_type.is_date());
41  Datum d;
42  d.bigintval = timestamp;
43  return DatumToString(d, column_type);
44 }
45 
/**
 * Throws a std::runtime_error reporting that a Parquet column holds a value
 * outside the range allowed by the target column type.
 *
 * @param min_value - smallest allowed value, already formatted as a string
 * @param max_value - largest allowed value, already formatted as a string
 * @param encountered_value - offending value, already formatted as a string
 *
 * @throws std::runtime_error always
 */
inline void throw_parquet_metadata_out_of_bounds_error(
    const std::string& min_value,
    const std::string& max_value,
    const std::string& encountered_value) {
  std::stringstream error_message;
  error_message << "Parquet column contains values that are outside the range of the "
                   "OmniSci column "
                   "type. Consider using a wider column type. Min allowed value: "
                << min_value << ". Max allowed value: " << max_value
                << ". Encountered value: " << encountered_value << ".";
  throw std::runtime_error(error_message.str());
}
58 
59 template <typename T>
61  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
62  "TimestampBoundsValidator is only defined for signed integral types.");
63 
64  public:
65  template <typename D>
66  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
67  if (!valueWithinBounds(data_value, column_type)) {
68  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
70  min_allowed_value,
71  max_allowed_value,
72  datetime_to_string(data_value, column_type));
73  }
74  }
75 
76  private:
77  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
78  CHECK(column_type.is_timestamp());
79  switch (column_type.get_size()) {
80  case 4:
81  return check_bounds<int32_t>(value);
82  case 8:
83  return check_bounds<int64_t>(value);
84  default:
85  UNREACHABLE();
86  }
87  return {};
88  }
89 
90  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
91  const SQLTypeInfo& column_type) {
92  CHECK(column_type.is_timestamp());
93  switch (column_type.get_size()) {
94  case 4:
95  return getMinMaxBoundsAsStrings<int32_t>(column_type);
96  case 8:
97  return getMinMaxBoundsAsStrings<int64_t>(column_type);
98  default:
99  UNREACHABLE();
100  }
101  return {};
102  }
103 
104  template <typename D>
105  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
106  const SQLTypeInfo& column_type) {
107  auto [min_value, max_value] = get_min_max_bounds<D>();
108  return {datetime_to_string(min_value, column_type),
109  datetime_to_string(max_value, column_type)};
110  }
111 };
112 
113 template <typename T>
115  static_assert(std::is_integral<T>::value,
116  "IntegralFixedLengthBoundsValidator is only defined for integral types.");
117 
118  public:
119  template <typename D>
120  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
121  if (!valueWithinBounds(data_value, column_type)) {
122  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
123  if (std::is_signed<T>::value) {
125  min_allowed_value, max_allowed_value, std::to_string(data_value));
126  } else {
128  min_allowed_value,
129  max_allowed_value,
130  std::to_string(static_cast<T>(data_value)));
131  }
132  }
133  }
134 
135  private:
136  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
137  CHECK(column_type.is_integer());
138  switch (column_type.get_size()) {
139  case 1:
140  return checkBounds<int8_t>(value);
141  case 2:
142  return checkBounds<int16_t>(value);
143  case 4:
144  return checkBounds<int32_t>(value);
145  case 8:
146  return checkBounds<int64_t>(value);
147  default:
148  UNREACHABLE();
149  }
150  return {};
151  }
152 
153  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
154  const SQLTypeInfo& column_type) {
155  CHECK(column_type.is_integer());
156  switch (column_type.get_size()) {
157  case 1:
158  return getMinMaxBoundsAsStrings<int8_t>();
159  case 2:
160  return getMinMaxBoundsAsStrings<int16_t>();
161  case 4:
162  return getMinMaxBoundsAsStrings<int32_t>();
163  case 8:
164  return getMinMaxBoundsAsStrings<int64_t>();
165  default:
166  UNREACHABLE();
167  }
168  return {};
169  }
170 
178  template <typename D,
179  typename TT = T,
180  std::enable_if_t<std::is_signed<TT>::value, int> = 0>
181  static bool checkBounds(const T& value) {
182  return check_bounds<D>(value);
183  }
184 
192  template <typename D,
193  typename TT = T,
194  std::enable_if_t<!std::is_signed<TT>::value, int> = 0>
195  static bool checkBounds(const T& value) {
196  auto [min_value, max_value] = get_min_max_bounds<D>();
197  auto signed_value = static_cast<D>(value);
198  return signed_value >= 0 && signed_value <= max_value;
199  }
200 
201  template <typename D>
202  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
203  auto [min_value, max_value] = get_min_max_bounds<D>();
204  return {std::to_string(min_value), std::to_string(max_value)};
205  }
206 };
207 
208 template <typename T>
210  static_assert(
211  std::is_integral<T>::value && std::is_signed<T>::value,
212  "DateInSecondsBoundsValidator is only defined for signed integral types.");
213 
214  public:
215  template <typename D>
216  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
217  if (!valueWithinBounds(data_value, column_type)) {
218  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
220  min_allowed_value,
221  max_allowed_value,
222  datetime_to_string(data_value, column_type));
223  }
224  }
225 
226  private:
227  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
228  CHECK(column_type.is_date());
229  switch (column_type.get_size()) {
230  case 4:
231  return checkBounds<int32_t>(value);
232  case 2:
233  return checkBounds<int16_t>(value);
234  default:
235  UNREACHABLE();
236  }
237  return {};
238  }
239 
240  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
241  const SQLTypeInfo& column_type) {
242  CHECK(column_type.is_date());
243  switch (column_type.get_size()) {
244  case 4:
245  return getMinMaxBoundsAsStrings<int32_t>(column_type);
246  case 2:
247  return getMinMaxBoundsAsStrings<int16_t>(column_type);
248  default:
249  UNREACHABLE();
250  }
251  return {};
252  }
253 
254  template <typename D>
255  static bool checkBounds(const T& value) {
256  auto [min_value, max_value] = get_min_max_bounds<D>();
257  return value >= kSecsPerDay * min_value && value <= kSecsPerDay * max_value;
258  }
259 
260  template <typename D>
261  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
262  const SQLTypeInfo& column_type) {
263  auto [min_value, max_value] = get_min_max_bounds<D>();
264  return {datetime_to_string(kSecsPerDay * min_value, column_type),
265  datetime_to_string(kSecsPerDay * max_value, column_type)};
266  }
267 };
268 
269 template <typename T>
271  static_assert(std::is_floating_point<T>::value,
272  "FloatPointValidator is only defined for floating point types.");
273 
274  public:
275  template <typename D>
276  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
277  if (!valueWithinBounds(data_value, column_type)) {
278  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
280  min_allowed_value, max_allowed_value, std::to_string(data_value));
281  }
282  }
283 
284  private:
285  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
286  CHECK(column_type.is_fp());
287  switch (column_type.get_size()) {
288  case 4:
289  return checkBounds<float>(value);
290  case 8:
291  return checkBounds<double>(value);
292  default:
293  UNREACHABLE();
294  }
295  return {};
296  }
297 
298  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
299  const SQLTypeInfo& column_type) {
300  CHECK(column_type.is_fp());
301  switch (column_type.get_size()) {
302  case 4:
303  return getMinMaxBoundsAsStrings<float>();
304  case 8:
305  return getMinMaxBoundsAsStrings<double>();
306  default:
307  UNREACHABLE();
308  }
309  return {};
310  }
311 
312  template <typename D>
313  static bool checkBounds(const T& value) {
314  return check_bounds<D>(value);
315  }
316 
317  template <typename D>
318  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
319  auto [min_value, max_value] = get_min_max_bounds<D>();
320  return {std::to_string(min_value), std::to_string(max_value)};
321  }
322 };
323 
324 } // namespace foreign_storage
static constexpr int64_t kSecsPerDay
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:392
bool is_timestamp() const
Definition: sqltypes.h:880
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
bool is_fp() const
Definition: sqltypes.h:513
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
#define UNREACHABLE()
Definition: Logger.h:253
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
virtual void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const =0
void throw_parquet_metadata_out_of_bounds_error(const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
std::string to_string(char const *&&v)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
bool is_integer() const
Definition: sqltypes.h:511
bool check_bounds(const T &value)
int64_t bigintval
Definition: sqltypes.h:215
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
#define CHECK(condition)
Definition: Logger.h:209
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
std::string datetime_to_string(const D &timestamp, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
bool is_date() const
Definition: sqltypes.h:868
static bool checkBounds(const T &value)
Check bounds for value in signed case.
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)