OmniSciDB  0264ff685a
ParquetMetadataValidator.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "ParquetEncoder.h"
20 
21 namespace foreign_storage {
23  public:
24  virtual ~ParquetMetadataValidator() = default;
25 
26  virtual void validate(std::shared_ptr<parquet::Statistics> stats,
27  const SQLTypeInfo& column_type) const = 0;
28 };
29 
30 template <typename V, std::enable_if_t<std::is_integral<V>::value, int> = 0>
31 inline V get_null_value() {
32  return inline_int_null_value<V>();
33 }
34 
35 template <typename V, std::enable_if_t<std::is_floating_point<V>::value, int> = 0>
36 inline V get_null_value() {
37  return inline_fp_null_value<V>();
38 }
39 
40 template <typename D, std::enable_if_t<std::is_integral<D>::value, int> = 0>
41 inline std::pair<D, D> get_min_max_bounds() {
42  static_assert(std::is_signed<D>::value,
43  "'get_min_max_bounds' is only valid for signed types");
44  return {get_null_value<D>() + 1, std::numeric_limits<D>::max()};
45 }
46 
/**
 * Inclusive {min, max} bounds for a floating point type `D`.
 *
 * @return a pair holding `std::numeric_limits<D>::lowest()` and
 *         `std::numeric_limits<D>::max()`, in that order.
 */
template <typename D, std::enable_if_t<std::is_floating_point<D>::value, int> = 0>
inline std::pair<D, D> get_min_max_bounds() {
  using Limits = std::numeric_limits<D>;
  const D lower_bound = Limits::lowest();
  const D upper_bound = Limits::max();
  return std::pair<D, D>(lower_bound, upper_bound);
}
51 
52 template <typename D, typename T>
53 inline bool check_bounds(const T& value) {
54  auto [min_value, max_value] = get_min_max_bounds<D>();
55  return value >= min_value && value <= max_value;
56 }
57 
58 template <typename D>
59 inline std::string datetime_to_string(const D& timestamp,
60  const SQLTypeInfo& column_type) {
61  CHECK(column_type.is_timestamp() || column_type.is_date());
62  Datum d;
63  d.bigintval = timestamp;
64  return DatumToString(d, column_type);
65 }
66 
/**
 * Throws a `std::runtime_error` reporting that a Parquet column holds a value
 * outside the range of the destination column type.
 *
 * @param min_value          smallest allowed value, already formatted.
 * @param max_value          largest allowed value, already formatted.
 * @param encountered_value  offending value, already formatted.
 * @throws std::runtime_error always; this function never returns.
 */
[[noreturn]] inline void throw_parquet_metadata_out_of_bounds_error(
    const std::string& min_value,
    const std::string& max_value,
    const std::string& encountered_value) {
  std::ostringstream error_message;
  error_message << "Parquet column contains values that are outside the range of the "
                   "OmniSci column "
                   "type. Consider using a wider column type. Min allowed value: "
                << min_value << ". Max allowed value: " << max_value
                << ". Encountered value: " << encountered_value << ".";
  throw std::runtime_error(error_message.str());
}
79 
80 template <typename T>
82  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
83  "TimestampBoundsValidator is only defined for signed integral types.");
84 
85  public:
86  template <typename D>
87  static void validateValue(const D& data_value,
88  const D& display_data_value,
89  const SQLTypeInfo& column_type) {
90  if (!valueWithinBounds(data_value, column_type)) {
91  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
93  min_allowed_value,
94  max_allowed_value,
95  datetime_to_string(display_data_value, column_type));
96  }
97  }
98 
99  template <typename D>
100  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
101  validateValue(data_value, data_value, column_type);
102  }
103 
104  private:
105  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
106  CHECK(column_type.is_timestamp());
107  switch (column_type.get_size()) {
108  case 4:
109  return check_bounds<int32_t>(value);
110  case 8:
111  return check_bounds<int64_t>(value);
112  default:
113  UNREACHABLE();
114  }
115  return {};
116  }
117 
118  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
119  const SQLTypeInfo& column_type) {
120  CHECK(column_type.is_timestamp());
121  switch (column_type.get_size()) {
122  case 4:
123  return getMinMaxBoundsAsStrings<int32_t>(column_type);
124  case 8:
125  return getMinMaxBoundsAsStrings<int64_t>(column_type);
126  default:
127  UNREACHABLE();
128  }
129  return {};
130  }
131 
132  template <typename D>
133  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
134  const SQLTypeInfo& column_type) {
135  auto [min_value, max_value] = get_min_max_bounds<D>();
136  return {datetime_to_string(min_value, column_type),
137  datetime_to_string(max_value, column_type)};
138  }
139 };
140 
141 template <typename T>
143  static_assert(std::is_integral<T>::value,
144  "IntegralFixedLengthBoundsValidator is only defined for integral types.");
145 
146  public:
147  template <typename D>
148  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
149  if (!valueWithinBounds(data_value, column_type)) {
150  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
151  if (std::is_signed<T>::value) {
153  min_allowed_value, max_allowed_value, std::to_string(data_value));
154  } else {
156  min_allowed_value,
157  max_allowed_value,
158  std::to_string(static_cast<T>(data_value)));
159  }
160  }
161  }
162 
163  private:
164  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
165  CHECK(column_type.is_integer());
166  switch (column_type.get_size()) {
167  case 1:
168  return checkBounds<int8_t>(value);
169  case 2:
170  return checkBounds<int16_t>(value);
171  case 4:
172  return checkBounds<int32_t>(value);
173  case 8:
174  return checkBounds<int64_t>(value);
175  default:
176  UNREACHABLE();
177  }
178  return {};
179  }
180 
181  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
182  const SQLTypeInfo& column_type) {
183  CHECK(column_type.is_integer());
184  switch (column_type.get_size()) {
185  case 1:
186  return getMinMaxBoundsAsStrings<int8_t>();
187  case 2:
188  return getMinMaxBoundsAsStrings<int16_t>();
189  case 4:
190  return getMinMaxBoundsAsStrings<int32_t>();
191  case 8:
192  return getMinMaxBoundsAsStrings<int64_t>();
193  default:
194  UNREACHABLE();
195  }
196  return {};
197  }
198 
206  template <typename D,
207  typename TT = T,
208  std::enable_if_t<std::is_signed<TT>::value, int> = 0>
209  static bool checkBounds(const T& value) {
210  return check_bounds<D>(value);
211  }
212 
220  template <typename D,
221  typename TT = T,
222  std::enable_if_t<!std::is_signed<TT>::value, int> = 0>
223  static bool checkBounds(const T& value) {
224  auto [min_value, max_value] = get_min_max_bounds<D>();
225  auto signed_value = static_cast<D>(value);
226  return signed_value >= 0 && signed_value <= max_value;
227  }
228 
229  template <typename D>
230  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
231  auto [min_value, max_value] = get_min_max_bounds<D>();
232  return {std::to_string(min_value), std::to_string(max_value)};
233  }
234 };
235 
236 template <typename T>
238  static_assert(
239  std::is_integral<T>::value && std::is_signed<T>::value,
240  "DateInSecondsBoundsValidator is only defined for signed integral types.");
241 
242  public:
243  template <typename D>
244  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
245  if (!valueWithinBounds(data_value, column_type)) {
246  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
248  min_allowed_value,
249  max_allowed_value,
250  datetime_to_string(data_value, column_type));
251  }
252  }
253 
254  private:
255  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
256  CHECK(column_type.is_date());
257  switch (column_type.get_size()) {
258  case 4:
259  return checkBounds<int32_t>(value);
260  case 2:
261  return checkBounds<int16_t>(value);
262  default:
263  UNREACHABLE();
264  }
265  return {};
266  }
267 
268  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
269  const SQLTypeInfo& column_type) {
270  CHECK(column_type.is_date());
271  switch (column_type.get_size()) {
272  case 4:
273  return getMinMaxBoundsAsStrings<int32_t>(column_type);
274  case 2:
275  return getMinMaxBoundsAsStrings<int16_t>(column_type);
276  default:
277  UNREACHABLE();
278  }
279  return {};
280  }
281 
282  template <typename D>
283  static bool checkBounds(const T& value) {
284  auto [min_value, max_value] = get_min_max_bounds<D>();
285  return value >= kSecsPerDay * min_value && value <= kSecsPerDay * max_value;
286  }
287 
288  template <typename D>
289  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
290  const SQLTypeInfo& column_type) {
291  auto [min_value, max_value] = get_min_max_bounds<D>();
292  return {datetime_to_string(kSecsPerDay * min_value, column_type),
293  datetime_to_string(kSecsPerDay * max_value, column_type)};
294  }
295 };
296 
297 template <typename T>
299  static_assert(std::is_floating_point<T>::value,
300  "FloatPointValidator is only defined for floating point types.");
301 
302  public:
303  template <typename D>
304  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
305  if (!valueWithinBounds(data_value, column_type)) {
306  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
308  min_allowed_value, max_allowed_value, std::to_string(data_value));
309  }
310  }
311 
312  private:
313  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
314  CHECK(column_type.is_fp());
315  switch (column_type.get_size()) {
316  case 4:
317  return checkBounds<float>(value);
318  case 8:
319  return checkBounds<double>(value);
320  default:
321  UNREACHABLE();
322  }
323  return {};
324  }
325 
326  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
327  const SQLTypeInfo& column_type) {
328  CHECK(column_type.is_fp());
329  switch (column_type.get_size()) {
330  case 4:
331  return getMinMaxBoundsAsStrings<float>();
332  case 8:
333  return getMinMaxBoundsAsStrings<double>();
334  default:
335  UNREACHABLE();
336  }
337  return {};
338  }
339 
340  template <typename D>
341  static bool checkBounds(const T& value) {
342  return check_bounds<D>(value);
343  }
344 
345  template <typename D>
346  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
347  auto [min_value, max_value] = get_min_max_bounds<D>();
348  return {std::to_string(min_value), std::to_string(max_value)};
349  }
350 };
351 
352 } // namespace foreign_storage
static constexpr int64_t kSecsPerDay
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:240
bool is_integer() const
Definition: sqltypes.h:480
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
#define UNREACHABLE()
Definition: Logger.h:241
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
bool is_date() const
Definition: sqltypes.h:715
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
HOST DEVICE int get_size() const
Definition: sqltypes.h:321
virtual void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const =0
void throw_parquet_metadata_out_of_bounds_error(const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
std::string to_string(char const *&&v)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
bool check_bounds(const T &value)
int64_t bigintval
Definition: sqltypes.h:206
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
std::pair< D, D > get_min_max_bounds()
static void validateValue(const D &data_value, const D &display_data_value, const SQLTypeInfo &column_type)
#define CHECK(condition)
Definition: Logger.h:197
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
std::string datetime_to_string(const D &timestamp, const SQLTypeInfo &column_type)
bool is_timestamp() const
Definition: sqltypes.h:727
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool checkBounds(const T &value)
Check bounds for value in signed case.
bool is_fp() const
Definition: sqltypes.h:482
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)