OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetDetectStringEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
20 #include "LazyParquetChunkLoader.h"
21 #include "ParquetEncoder.h"
23 
24 #include <parquet/schema.h>
25 #include <parquet/types.h>
26 
27 namespace foreign_storage {
28 
30  public:
32  : ParquetScalarEncoder(buffer)
35  }
36 
37  void setNull(int8_t* omnisci_data_bytes) override { UNREACHABLE(); }
38  void copy(const int8_t* omnisci_data_bytes_source,
39  int8_t* omnisci_data_bytes_destination) override {
40  UNREACHABLE();
41  }
42 
43  void encodeAndCopy(const int8_t* parquet_data_bytes,
44  int8_t* omnisci_data_bytes) override {
45  UNREACHABLE();
46  }
47 
48  void encodeAndCopyContiguous(const int8_t* parquet_data_bytes,
49  int8_t* omnisci_data_bytes,
50  const size_t num_elements) override {
51  UNREACHABLE();
52  }
53 
54  void validate(const int8_t* parquet_data,
55  const int64_t j,
56  const SQLTypeInfo& column_type) const override {
57  auto parquet_data_ptr = reinterpret_cast<const parquet::ByteArray*>(parquet_data);
58  auto& byte_array = parquet_data_ptr[j];
59  if (byte_array.len > StringDictionary::MAX_STRLEN) {
60  throw ForeignStorageException("String exceeeds max length allowed in dictionary");
61  }
62  }
63 
64  void validateUsingEncodersColumnType(const int8_t* parquet_data,
65  const int64_t j) const override {
66  validate(parquet_data, j, column_type_);
67  }
68 
69  std::string encodedDataToString(const int8_t* bytes) const override {
70  UNREACHABLE();
71  return {};
72  }
73 
75  const InvalidRowGroupIndices& invalid_indices) override {
76  UNREACHABLE();
77  }
78 
79  void validateAndAppendData(const int16_t* def_levels,
80  const int16_t* rep_levels,
81  const int64_t values_read,
82  const int64_t levels_read,
83  int8_t* values,
84  const SQLTypeInfo& column_type, /* may not be used */
85  InvalidRowGroupIndices& invalid_indices) override {
86  UNREACHABLE();
87  }
88 
89  void appendData(const int16_t* def_levels,
90  const int16_t* rep_levels,
91  const int64_t values_read,
92  const int64_t levels_read,
93  int8_t* values) override {
94  CHECK(levels_read > 0);
95 
96  auto parquet_data_ptr = reinterpret_cast<const parquet::ByteArray*>(values);
97 
98  for (int64_t i = 0, j = 0; i < levels_read; ++i) {
99  if (def_levels[i]) {
100  CHECK(j < values_read);
101  auto& byte_array = parquet_data_ptr[j++];
104  i);
105  detect_buffer_->appendValue({}); // add empty string
106  } else {
107  auto string_value =
108  std::string{reinterpret_cast<const char*>(byte_array.ptr), byte_array.len};
109  detect_buffer_->appendValue(string_value);
110  }
111  } else {
112  detect_buffer_->appendValue("NULL");
113  }
114  }
117  }
118  }
119 
120  void appendDataTrackErrors(const int16_t* def_levels,
121  const int16_t* rep_levels,
122  const int64_t values_read,
123  const int64_t levels_read,
124  int8_t* values) override {
126  appendData(def_levels, rep_levels, values_read, levels_read, values);
127  }
128 
130 };
131 
132 } // namespace foreign_storage
void validateUsingEncodersColumnType(const int8_t *parquet_data, const int64_t j) const override
RejectedRowIndices invalid_indices_
ParquetDetectStringEncoder(Data_Namespace::AbstractBuffer *buffer)
#define UNREACHABLE()
Definition: Logger.h:338
void setNull(int8_t *omnisci_data_bytes) override
void encodeAndCopyContiguous(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
std::set< int64_t > InvalidRowGroupIndices
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
An AbstractBuffer is a unit of data management for a data manager.
void validate(const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
void validateAndAppendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values, const SQLTypeInfo &column_type, InvalidRowGroupIndices &invalid_indices) override
void copy(const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override
#define CHECK(condition)
Definition: Logger.h:291
std::string encodedDataToString(const int8_t *bytes) const override
static constexpr size_t MAX_STRLEN
void eraseInvalidIndicesInBuffer(const InvalidRowGroupIndices &invalid_indices) override
Data_Namespace::AbstractBuffer * buffer_
void appendDataTrackErrors(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override