OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetArrayDetectEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <parquet/types.h>
20 
21 #include "ParquetArrayEncoder.h"
23 #include "Shared/StringTransform.h"
25 
26 namespace foreign_storage {
28  public:
30  std::shared_ptr<ParquetScalarEncoder> scalar_encoder,
31  const ColumnDescriptor* column_desciptor)
32  : ParquetArrayEncoder(data_buffer, scalar_encoder, column_desciptor)
33  , detect_buffer_(dynamic_cast<TypedParquetDetectBuffer*>(data_buffer))
35  dynamic_cast<ParquetDetectStringEncoder*>(scalar_encoder_.get())) {
37  }
38 
39  void appendArrayItem(const int64_t encoded_index) override {
40  if (!is_string_array_) {
41  auto string_value =
42  scalar_encoder_->encodedDataToString(encodedDataAtIndex(encoded_index));
43  array_string_.emplace_back(string_value);
44  } else {
45  CHECK_GT(string_buffer_.size(), static_cast<size_t>(encoded_index));
46  array_string_.emplace_back(string_buffer_[encoded_index]);
47  }
49  }
50 
51  protected:
52  void encodeAllValues(const int8_t* values, const int64_t values_read) override {
53  if (!is_string_array_) {
54  ParquetArrayEncoder::encodeAllValues(values, values_read);
55  } else { // string arrays are a special case that require special handling
56  string_buffer_.clear();
57  auto parquet_data_ptr = reinterpret_cast<const parquet::ByteArray*>(values);
58  for (int64_t i = 0; i < values_read; ++i) {
59  auto& byte_array = parquet_data_ptr[i];
60  auto string_value =
61  std::string{reinterpret_cast<const char*>(byte_array.ptr), byte_array.len};
62  string_buffer_.push_back(string_value);
63  }
64  }
65  }
66 
67  void appendArraysToBuffer() override {
68  // no-op as data is already written to buffer in `processLastArray`
69  }
70 
71  void processLastArray() override {
74  }
75 
76  private:
78  if (isLastArrayNull()) {
79  detect_buffer_->appendValue("NULL");
80  } else if (isLastArrayEmpty()) {
82  } else {
83  detect_buffer_->appendValue("{" + join(array_string_, ",") + "}");
84  array_string_.clear();
85  }
86  }
87 
89  const bool is_string_array_;
90  std::vector<std::string> array_string_;
91  std::vector<std::string> string_buffer_;
92 };
93 } // namespace foreign_storage
std::string join(T const &container, std::string const &delim)
void appendArrayItem(const int64_t encoded_index) override
virtual void encodeAllValues(const int8_t *values, const int64_t values_read)
#define CHECK_GT(x, y)
Definition: Logger.h:305
void encodeAllValues(const int8_t *values, const int64_t values_read) override
int8_t * encodedDataAtIndex(const size_t index)
An AbstractBuffer is a unit of data management for a data manager.
specifies the content in-memory of a row in the column metadata table
void updateMetadataForAppendedArrayItem(const int64_t encoded_index)
ParquetArrayDetectEncoder(Data_Namespace::AbstractBuffer *data_buffer, std::shared_ptr< ParquetScalarEncoder > scalar_encoder, const ColumnDescriptor *column_desciptor)
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_