OmniSciDB  a667adc9c8
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ParquetVariableLengthArrayEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <parquet/types.h>
20 
22 #include "ParquetArrayEncoder.h"
23 
24 namespace foreign_storage {
26  public:
// NOTE(review): the first line of this constructor's signature (original
// source line 27, declaring the data_buffer parameter) is missing from this
// listing -- confirm against the original header.
// The constructor passes data_buffer, scalar_encoder and column_desciptor to
// the ParquetArrayEncoder base and keeps index_buffer in index_buffer_.
28  Data_Namespace::AbstractBuffer* index_buffer,
29  std::shared_ptr<ParquetScalarEncoder> scalar_encoder,
30  const ColumnDescriptor* column_desciptor)
31  : ParquetArrayEncoder(data_buffer, scalar_encoder, column_desciptor)
32  , index_buffer_(index_buffer) {}
33 
// Appends one batch of levels/values to this encoder.
// Requires that at least one level was read (enforced with CHECK), then hands
// the first definition level of the batch to setFirstOffsetForBuffer().
// NOTE(review): original source line 42 is missing from this listing; the
// argument list on line 43 presumably belongs to a forwarding call to the
// base-class appendData -- confirm against the original header.
34  void appendData(const int16_t* def_levels,
35  const int16_t* rep_levels,
36  const int64_t values_read,
37  const int64_t levels_read,
38  const bool is_last_batch,
39  int8_t* values) override {
// def_levels[0] is read on the next line, so the batch must not be empty.
40  CHECK(levels_read > 0);
41  setFirstOffsetForBuffer(def_levels[0]);
43  def_levels, rep_levels, values_read, levels_read, is_last_batch, values);
44  }
45 
46  protected:
// Appends the accumulated offsets to index_buffer_ as raw bytes
// (offsets_.size() * sizeof(ArrayOffsetT)) and then empties offsets_.
// NOTE(review): original source line 51 is missing from this listing;
// presumably it flushes the array data itself (e.g. via the base class) --
// confirm against the original header.
47  void appendArraysToBuffer() override {
48  index_buffer_->append(reinterpret_cast<int8_t*>(offsets_.data()),
49  offsets_.size() * sizeof(ArrayOffsetT));
50  offsets_.clear();
52  }
53 
// Override that handles the last array by calling appendLastArrayOffset().
54  void processLastArray() override { appendLastArrayOffset(); }
55 
56  private:
// Sets up the first offset when nothing has been written yet, i.e. when both
// data_buffer_bytes_ and buffer_ are empty; otherwise does nothing.
// NOTE(review): original source lines 59 and 65 are missing from this
// listing. Line 59 presumably opens the `if` that tests def_level for a null
// first array (it is closed by the `} else {` on line 69), and line 65
// presumably records the matching offset -- confirm against the original
// header.
57  void setFirstOffsetForBuffer(const int16_t def_level) {
58  if (data_buffer_bytes_.size() == 0 && buffer_->size() == 0) { // first element
60  // OmniSci variable array types have a special encoding for chunks in
61  // which the first array is null: the first `DEFAULT_NULL_PADDING_SIZE`
62  // bytes of the chunk are filled and the offset is set appropriately.
63  // Ostensibly, this is done to allow marking a null array by negating
64  // a non-zero value.
// Pad the pending data bytes with DEFAULT_NULL_PADDING_SIZE zero bytes.
66  std::vector<int8_t> zero_bytes(ArrayNoneEncoder::DEFAULT_NULL_PADDING_SIZE, 0);
67  data_buffer_bytes_.insert(
68  data_buffer_bytes_.end(), zero_bytes.begin(), zero_bytes.end());
69  } else {
// Otherwise the first offset is simply 0.
70  offsets_.push_back(0);
71  }
72  }
73  }
74 
// NOTE(review): the signature line (original source line 75) is missing from
// this listing; this is presumably the body of appendLastArrayOffset(), which
// processLastArray() calls -- confirm against the original header.
// last_offset is the number of bytes already in buffer_ plus the bytes still
// pending in data_buffer_bytes_.
76  int64_t last_offset = buffer_->size() + data_buffer_bytes_.size();
77  if (!isLastArrayNull()) {
78  // append array data offset
79  offsets_.push_back(last_offset);
80  } else {
81  // append a null array offset
// A null array is recorded as the negated offset.
82  offsets_.push_back(-last_offset);
83  }
84  }
85 
87  std::vector<ArrayOffsetT> offsets_;
88 };
89 } // namespace foreign_storage
ParquetVariableLengthArrayEncoder(Data_Namespace::AbstractBuffer *data_buffer, Data_Namespace::AbstractBuffer *index_buffer, std::shared_ptr< ParquetScalarEncoder > scalar_encoder, const ColumnDescriptor *column_desciptor)
An AbstractBuffer is a unit of data management for a data manager.
Specifies the in-memory content of a row in the column metadata table.
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
static constexpr size_t DEFAULT_NULL_PADDING_SIZE
int32_t ArrayOffsetT
Definition: sqltypes.h:937
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
#define CHECK(condition)
Definition: Logger.h:197
unencoded array encoder
Data_Namespace::AbstractBuffer * buffer_
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override