OmniSciDB  2e3a973ef4
ParquetStringEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "LazyParquetChunkLoader.h"
20 #include "ParquetInPlaceEncoder.h"
22 
23 #include <parquet/schema.h>
24 #include <parquet/types.h>
25 
26 namespace foreign_storage {
27 
28 template <typename V>
30  public:
32  StringDictionary* string_dictionary,
33  std::unique_ptr<ChunkMetadata>& chunk_metadata)
34  : TypedParquetInPlaceEncoder<V, V>(buffer, sizeof(V), sizeof(V))
35  , string_dictionary_(string_dictionary)
36  , chunk_metadata_(chunk_metadata)
37  , encode_buffer_(LazyParquetChunkLoader::batch_reader_num_elements * sizeof(V))
38  , min_(std::numeric_limits<V>::max())
39  , max_(std::numeric_limits<V>::lowest()) {}
40 
41  void appendData(const int16_t* def_levels,
42  const int16_t* rep_levels,
43  const int64_t values_read,
44  const int64_t levels_read,
45  const bool is_last_batch,
46  int8_t* values) override {
47  encodeAndCopyContiguous(values, encode_buffer_.data(), values_read);
49  rep_levels,
50  values_read,
51  levels_read,
52  is_last_batch,
53  encode_buffer_.data());
54  }
55 
56  void encodeAndCopyContiguous(const int8_t* parquet_data_bytes,
57  int8_t* omnisci_data_bytes,
58  const size_t num_elements) override {
59  auto parquet_data_ptr =
60  reinterpret_cast<const parquet::ByteArray*>(parquet_data_bytes);
61  auto omnisci_data_ptr = reinterpret_cast<V*>(omnisci_data_bytes);
62  std::vector<std::string_view> string_views;
63  string_views.reserve(num_elements);
64  for (size_t i = 0; i < num_elements; ++i) {
65  auto& byte_array = parquet_data_ptr[i];
66  string_views.emplace_back(reinterpret_cast<const char*>(byte_array.ptr),
67  byte_array.len);
68  }
69  string_dictionary_->getOrAddBulk(string_views, omnisci_data_ptr);
70  updateMetadataStats(num_elements, omnisci_data_bytes);
71  }
72 
73  void encodeAndCopy(const int8_t* parquet_data_bytes,
74  int8_t* omnisci_data_bytes) override {
75  TypedParquetInPlaceEncoder<V, V>::copy(parquet_data_bytes, omnisci_data_bytes);
76  }
77 
78  protected:
79  bool encodingIsIdentityForSameTypes() const override { return true; }
80 
81  private:
82  void updateMetadataStats(int64_t values_read, int8_t* values) {
83  V* data_ptr = reinterpret_cast<V*>(values);
84  for (int64_t i = 0; i < values_read; ++i) {
85  min_ = std::min<V>(data_ptr[i], min_);
86  max_ = std::max<V>(data_ptr[i], max_);
87  }
88  chunk_metadata_->fillChunkStats(min_, max_, false);
89  }
90 
92  std::unique_ptr<ChunkMetadata>& chunk_metadata_;
93  std::vector<int8_t> encode_buffer_;
94 
95  V min_, max_;
96 };
97 
98 } // namespace foreign_storage
bool encodingIsIdentityForSameTypes() const override
void updateMetadataStats(int64_t values_read, int8_t *values)
void copy(const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
ParquetStringEncoder(Data_Namespace::AbstractBuffer *buffer, StringDictionary *string_dictionary, std::unique_ptr< ChunkMetadata > &chunk_metadata)
An AbstractBuffer is a unit of data management for a data manager.
std::unique_ptr< ChunkMetadata > & chunk_metadata_
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
void getOrAddBulk(const std::vector< String > &string_vec, T *encoded_vec)
void encodeAndCopyContiguous(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override