OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetVariableLengthArrayEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <parquet/types.h>
20 
22 #include "ParquetArrayEncoder.h"
23 
24 namespace foreign_storage {
26  public:
// Constructor. The line that opens its declaration is absent from this
// listing, so only the tail of the parameter list is visible here.
// The initializer list forwards data_buffer, scalar_encoder and
// column_desciptor to the ParquetArrayEncoder base class and stores
// index_buffer in the index_buffer_ member.
28  Data_Namespace::AbstractBuffer* index_buffer,
29  std::shared_ptr<ParquetScalarEncoder> scalar_encoder,
30  const ColumnDescriptor* column_desciptor)
31  : ParquetArrayEncoder(data_buffer, scalar_encoder, column_desciptor)
32  , index_buffer_(index_buffer) {}
33 
// Appends one batch described by def_levels / rep_levels / values.
//
// Steps visible in this listing:
//  1. CHECK that levels_read > 0, which guards the def_levels[0] read below.
//  2. Pass def_levels[0] to setFirstOffsetForBuffer().
//  3. Forward all five arguments, unchanged, to another call.
//
// NOTE(review): the line naming the callee of step 3 (listing line 41) is
// missing from this listing; presumably it is the base-class appendData --
// confirm against the original header.
34  void appendData(const int16_t* def_levels,
35  const int16_t* rep_levels,
36  const int64_t values_read,
37  const int64_t levels_read,
38  int8_t* values) override {
39  CHECK(levels_read > 0);
40  setFirstOffsetForBuffer(def_levels[0]);
42  def_levels, rep_levels, values_read, levels_read, values);
43  }
44 
45  protected:
// Flushes the accumulated offsets: the contents of offsets_ are appended to
// index_buffer_ as raw bytes (offsets_.size() * sizeof(ArrayOffsetT) of them),
// after which offsets_ is cleared.
//
// NOTE(review): listing line 50 is missing here, so there may be one more
// statement after the clear() -- confirm against the original header.
46  void appendArraysToBuffer() override {
47  index_buffer_->append(reinterpret_cast<int8_t*>(offsets_.data()),
48  offsets_.size() * sizeof(ArrayOffsetT));
49  offsets_.clear();
51  }
52 
// Override of processLastArray().
// NOTE(review): the body (listing lines 54-55) is not present in this
// listing, so what it does cannot be stated from here -- consult the original
// header.
53  void processLastArray() override {
56  }
57 
58  private:
// Seeds the offset state for the very first element written.
//
// Only acts when both data_buffer_bytes_ and buffer_ are empty. In that case
// one of two branches runs:
//  - one branch inserts ArrayNoneEncoder::DEFAULT_NULL_PADDING_SIZE zero bytes
//    at the end of data_buffer_bytes_;
//  - the other branch pushes an initial offset of 0 onto offsets_.
//
// NOTE(review): listing lines 61-62 and 68 are missing here, including the
// condition that selects between the branches (which is why the braces below
// look unbalanced). Per the existing comment the padding branch is for a
// first array that is null, presumably decided from def_level -- confirm
// against the original header.
59  void setFirstOffsetForBuffer(const int16_t def_level) {
60  if (data_buffer_bytes_.size() == 0 && buffer_->size() == 0) { // first element
63  // OmniSci variable array types have a special encoding for chunks in
64  // which the first array is null: the first `DEFAULT_NULL_PADDING_SIZE`
65  // bytes of the chunk are filled and the offset is set appropriately.
66  // Ostensibly, this is done to allow marking a null array by negating
67  // a non-zero value.
69  std::vector<int8_t> zero_bytes(ArrayNoneEncoder::DEFAULT_NULL_PADDING_SIZE, 0);
70  data_buffer_bytes_.insert(
71  data_buffer_bytes_.end(), zero_bytes.begin(), zero_bytes.end());
72  } else {
73  offsets_.push_back(0);
74  }
75  }
76  }
77 
// Body of the routine that records the offset for the most recent array.
// NOTE(review): the function header (listing line 78) is missing from this
// listing, so its name and signature are not visible here.
//
// last_offset is the total number of bytes currently held, i.e.
// buffer_->size() plus data_buffer_bytes_.size(). A null array is recorded as
// the negated offset; every other array is recorded as the offset itself.
//
// NOTE(review): the isLastArrayEmpty() branch and the final else branch push
// the same value, so as written they are indistinguishable.
// NOTE(review): last_offset is int64_t while offsets_ stores ArrayOffsetT
// (listed as int32_t), so each push_back narrows -- confirm the combined
// size always fits in 32 bits.
79  int64_t last_offset = buffer_->size() + data_buffer_bytes_.size();
80  if (isLastArrayNull()) {
81  // append a null array offset
82  offsets_.push_back(-last_offset);
83  } else if (isLastArrayEmpty()) {
84  offsets_.push_back(last_offset);
85  } else {
86  // append array data offset
87  offsets_.push_back(last_offset);
88  }
89  }
90 
// Array offsets accumulated since the last flush; appendArraysToBuffer()
// writes them to index_buffer_ and then clears this vector.
92  std::vector<ArrayOffsetT> offsets_;
93 };
94 } // namespace foreign_storage
ParquetVariableLengthArrayEncoder(Data_Namespace::AbstractBuffer *data_buffer, Data_Namespace::AbstractBuffer *index_buffer, std::shared_ptr< ParquetScalarEncoder > scalar_encoder, const ColumnDescriptor *column_desciptor)
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
An AbstractBuffer is a unit of data management for a data manager.
Specifies the in-memory content of a row in the column metadata table.
static constexpr size_t DEFAULT_NULL_PADDING_SIZE
int32_t ArrayOffsetT
Definition: sqltypes.h:1494
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
#define CHECK(condition)
Definition: Logger.h:291
unencoded array encoder
Data_Namespace::AbstractBuffer * buffer_