OmniSciDB  85c2d10cdc
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ParquetMetadataValidator.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
#include <type_traits>

#include "ParquetEncoder.h"
20 
21 namespace foreign_storage {
23  public:
24  virtual ~ParquetMetadataValidator() = default;
25 
26  virtual void validate(std::shared_ptr<parquet::Statistics> stats,
27  const SQLTypeInfo& column_type) const = 0;
28 };
29 
30 template <typename V, std::enable_if_t<std::is_integral<V>::value, int> = 0>
31 inline V get_null_value() {
32  return inline_int_null_value<V>();
33 }
34 
35 template <typename V, std::enable_if_t<std::is_floating_point<V>::value, int> = 0>
36 inline V get_null_value() {
37  return inline_fp_null_value<V>();
38 }
39 
40 template <typename D, std::enable_if_t<std::is_integral<D>::value, int> = 0>
41 inline std::pair<D, D> get_min_max_bounds() {
42  static_assert(std::is_signed<D>::value,
43  "'get_min_max_bounds' is only valid for signed types");
44  return {get_null_value<D>() + 1, std::numeric_limits<D>::max()};
45 }
46 
/**
 * Floating point overload: the allowed range is the full finite range of D,
 * i.e. [std::numeric_limits<D>::lowest(), std::numeric_limits<D>::max()].
 *
 * @return {min, max} pair of allowed values
 */
template <typename D, std::enable_if_t<std::is_floating_point<D>::value, int> = 0>
inline std::pair<D, D> get_min_max_bounds() {
  return {std::numeric_limits<D>::lowest(), std::numeric_limits<D>::max()};
}
51 
52 template <typename D, typename T>
53 inline bool check_bounds(const T& value) {
54  auto [min_value, max_value] = get_min_max_bounds<D>();
55  return value >= min_value && value <= max_value;
56 }
57 
58 template <typename D>
59 inline std::string datetime_to_string(const D& timestamp,
60  const SQLTypeInfo& column_type) {
61  CHECK(column_type.is_timestamp() || column_type.is_date());
62  Datum d;
63  d.bigintval = timestamp;
64  return DatumToString(d, column_type);
65 }
66 
/**
 * Throws a std::runtime_error describing a Parquet value that does not fit in
 * the destination column type.
 *
 * @param min_value - string form of the minimum allowed value
 * @param max_value - string form of the maximum allowed value
 * @param encountered_value - string form of the offending value
 *
 * @throws std::runtime_error always
 */
inline void throw_parquet_metadata_out_of_bounds_error(
    const std::string& min_value,
    const std::string& max_value,
    const std::string& encountered_value) {
  std::stringstream error_message;
  error_message << "Parquet column contains values that are outside the range of the "
                   "OmniSci column "
                   "type. Consider using a wider column type. Min allowed value: "
                << min_value << ". Max allowed value: " << max_value
                << ". Encountered value: " << encountered_value << ".";
  throw std::runtime_error(error_message.str());
}
79 
80 template <typename T>
82  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
83  "TimestampBoundsValidator is only defined for signed integral types.");
84 
85  public:
86  template <typename D>
87  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
88  if (!valueWithinBounds(data_value, column_type)) {
89  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
91  min_allowed_value,
92  max_allowed_value,
93  datetime_to_string(data_value, column_type));
94  }
95  }
96 
97  private:
98  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
99  CHECK(column_type.is_timestamp());
100  switch (column_type.get_size()) {
101  case 4:
102  return check_bounds<int32_t>(value);
103  case 8:
104  return check_bounds<int64_t>(value);
105  default:
106  UNREACHABLE();
107  }
108  return {};
109  }
110 
111  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
112  const SQLTypeInfo& column_type) {
113  CHECK(column_type.is_timestamp());
114  switch (column_type.get_size()) {
115  case 4:
116  return getMinMaxBoundsAsStrings<int32_t>(column_type);
117  case 8:
118  return getMinMaxBoundsAsStrings<int64_t>(column_type);
119  default:
120  UNREACHABLE();
121  }
122  return {};
123  }
124 
125  template <typename D>
126  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
127  const SQLTypeInfo& column_type) {
128  auto [min_value, max_value] = get_min_max_bounds<D>();
129  return {datetime_to_string(min_value, column_type),
130  datetime_to_string(max_value, column_type)};
131  }
132 };
133 
134 template <typename T>
136  static_assert(std::is_integral<T>::value,
137  "IntegralFixedLengthBoundsValidator is only defined for integral types.");
138 
139  public:
140  template <typename D>
141  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
142  if (!valueWithinBounds(data_value, column_type)) {
143  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
144  if (std::is_signed<T>::value) {
146  min_allowed_value, max_allowed_value, std::to_string(data_value));
147  } else {
149  min_allowed_value,
150  max_allowed_value,
151  std::to_string(static_cast<T>(data_value)));
152  }
153  }
154  }
155 
156  private:
157  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
158  CHECK(column_type.is_integer());
159  switch (column_type.get_size()) {
160  case 1:
161  return checkBounds<int8_t>(value);
162  case 2:
163  return checkBounds<int16_t>(value);
164  case 4:
165  return checkBounds<int32_t>(value);
166  case 8:
167  return checkBounds<int64_t>(value);
168  default:
169  UNREACHABLE();
170  }
171  return {};
172  }
173 
174  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
175  const SQLTypeInfo& column_type) {
176  CHECK(column_type.is_integer());
177  switch (column_type.get_size()) {
178  case 1:
179  return getMinMaxBoundsAsStrings<int8_t>();
180  case 2:
181  return getMinMaxBoundsAsStrings<int16_t>();
182  case 4:
183  return getMinMaxBoundsAsStrings<int32_t>();
184  case 8:
185  return getMinMaxBoundsAsStrings<int64_t>();
186  default:
187  UNREACHABLE();
188  }
189  return {};
190  }
191 
199  template <typename D,
200  typename TT = T,
201  std::enable_if_t<std::is_signed<TT>::value, int> = 0>
202  static bool checkBounds(const T& value) {
203  return check_bounds<D>(value);
204  }
205 
213  template <typename D,
214  typename TT = T,
215  std::enable_if_t<!std::is_signed<TT>::value, int> = 0>
216  static bool checkBounds(const T& value) {
217  auto [min_value, max_value] = get_min_max_bounds<D>();
218  auto signed_value = static_cast<D>(value);
219  return signed_value >= 0 && signed_value <= max_value;
220  }
221 
222  template <typename D>
223  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
224  auto [min_value, max_value] = get_min_max_bounds<D>();
225  return {std::to_string(min_value), std::to_string(max_value)};
226  }
227 };
228 
229 template <typename T>
231  static_assert(
232  std::is_integral<T>::value && std::is_signed<T>::value,
233  "DateInSecondsBoundsValidator is only defined for signed integral types.");
234 
235  public:
236  template <typename D>
237  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
238  if (!valueWithinBounds(data_value, column_type)) {
239  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
241  min_allowed_value,
242  max_allowed_value,
243  datetime_to_string(data_value, column_type));
244  }
245  }
246 
247  private:
248  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
249  CHECK(column_type.is_date());
250  switch (column_type.get_size()) {
251  case 4:
252  return checkBounds<int32_t>(value);
253  case 2:
254  return checkBounds<int16_t>(value);
255  default:
256  UNREACHABLE();
257  }
258  return {};
259  }
260 
261  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
262  const SQLTypeInfo& column_type) {
263  CHECK(column_type.is_date());
264  switch (column_type.get_size()) {
265  case 4:
266  return getMinMaxBoundsAsStrings<int32_t>(column_type);
267  case 2:
268  return getMinMaxBoundsAsStrings<int16_t>(column_type);
269  default:
270  UNREACHABLE();
271  }
272  return {};
273  }
274 
275  template <typename D>
276  static bool checkBounds(const T& value) {
277  auto [min_value, max_value] = get_min_max_bounds<D>();
278  return value >= kSecsPerDay * min_value && value <= kSecsPerDay * max_value;
279  }
280 
281  template <typename D>
282  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
283  const SQLTypeInfo& column_type) {
284  auto [min_value, max_value] = get_min_max_bounds<D>();
285  return {datetime_to_string(kSecsPerDay * min_value, column_type),
286  datetime_to_string(kSecsPerDay * max_value, column_type)};
287  }
288 };
289 
290 template <typename T>
292  static_assert(std::is_floating_point<T>::value,
293  "FloatPointValidator is only defined for floating point types.");
294 
295  public:
296  template <typename D>
297  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
298  if (!valueWithinBounds(data_value, column_type)) {
299  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
301  min_allowed_value, max_allowed_value, std::to_string(data_value));
302  }
303  }
304 
305  private:
306  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
307  CHECK(column_type.is_fp());
308  switch (column_type.get_size()) {
309  case 4:
310  return checkBounds<float>(value);
311  case 8:
312  return checkBounds<double>(value);
313  default:
314  UNREACHABLE();
315  }
316  return {};
317  }
318 
319  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
320  const SQLTypeInfo& column_type) {
321  CHECK(column_type.is_fp());
322  switch (column_type.get_size()) {
323  case 4:
324  return getMinMaxBoundsAsStrings<float>();
325  case 8:
326  return getMinMaxBoundsAsStrings<double>();
327  default:
328  UNREACHABLE();
329  }
330  return {};
331  }
332 
333  template <typename D>
334  static bool checkBounds(const T& value) {
335  return check_bounds<D>(value);
336  }
337 
338  template <typename D>
339  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
340  auto [min_value, max_value] = get_min_max_bounds<D>();
341  return {std::to_string(min_value), std::to_string(max_value)};
342  }
343 };
344 
345 } // namespace foreign_storage
static constexpr int64_t kSecsPerDay
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
HOST DEVICE int get_size() const
Definition: sqltypes.h:324
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:356
bool is_timestamp() const
Definition: sqltypes.h:742
tuple d
Definition: test_fsi.py:9
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
bool is_fp() const
Definition: sqltypes.h:492
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
#define UNREACHABLE()
Definition: Logger.h:241
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
virtual void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const =0
void throw_parquet_metadata_out_of_bounds_error(const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
std::string to_string(char const *&&v)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
bool is_integer() const
Definition: sqltypes.h:490
bool check_bounds(const T &value)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
std::pair< D, D > get_min_max_bounds()
#define CHECK(condition)
Definition: Logger.h:197
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
std::string datetime_to_string(const D &timestamp, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
bool is_date() const
Definition: sqltypes.h:730
static bool checkBounds(const T &value)
Check bounds for value in signed case.
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)