OmniSciDB  95562058bd
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ParquetVariableLengthArrayEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <parquet/types.h>
20 #include "ParquetArrayEncoder.h"
21 
22 namespace foreign_storage {
24  public:
26  Data_Namespace::AbstractBuffer* index_buffer,
    // NOTE(review): listing line 25 (the constructor name and its first
    // parameter) is missing from this extract; the reference list at the bottom
    // of this page shows the full signature, which starts with `data_buffer`.
27  std::shared_ptr<ParquetScalarEncoder> scalar_encoder,
28  const ColumnDescriptor* column_desciptor)
    // data_buffer, scalar_encoder and column_desciptor are forwarded to the
    // ParquetArrayEncoder base; index_buffer is kept in index_buffer_, the
    // buffer that appendArraysToBuffer() writes the array offsets into.
29  : ParquetArrayEncoder(data_buffer, scalar_encoder, column_desciptor)
30  , index_buffer_(index_buffer) {}
31 
32  void appendData(const int16_t* def_levels,
33  const int16_t* rep_levels,
34  const int64_t values_read,
35  const int64_t levels_read,
36  const bool is_last_batch,
37  int8_t* values) override {
    // The batch must contain at least one level, because def_levels[0] is
    // dereferenced immediately after this check.
38  CHECK(levels_read > 0);
    // Seed the offsets from the first definition level; this only has an
    // effect while both the staged bytes and the data buffer are still empty.
39  setFirstOffsetForBuffer(def_levels[0]);
    // NOTE(review): listing line 40 is absent from this extract. The argument
    // list below presumably completes a call that forwards the unchanged batch
    // to ParquetArrayEncoder::appendData -- confirm against the original header.
41  def_levels, rep_levels, values_read, levels_read, is_last_batch, values);
42  }
43 
44  protected:
45  void appendArraysToBuffer() override {
    // Write the accumulated offsets to the index buffer as raw bytes
    // (element count * sizeof(ArrayOffsetT)), then reset the accumulator so
    // the same offsets are not appended twice.
46  index_buffer_->append(reinterpret_cast<int8_t*>(offsets_.data()),
47  offsets_.size() * sizeof(ArrayOffsetT));
48  offsets_.clear();
    // NOTE(review): listing line 49 is absent from this extract; presumably it
    // invokes the base-class appendArraysToBuffer() to flush the array data
    // itself -- confirm against the original header.
50  }
51 
52  void processLastArray() override { appendLastArrayOffset(); }  // records the offset entry for the most recent array
53 
54  private:
55  void setFirstOffsetForBuffer(const int16_t def_level) {
    // Pushes the initial entry of offsets_. Nothing happens unless no bytes are
    // staged in data_buffer_bytes_ and buffer_ is empty as well.
56  if (data_buffer_bytes_.size() == 0 && buffer_->size() == 0) { // first element
    // NOTE(review): listing line 57 is absent from this extract. Judging by the
    // comment below and the `else` branch, it presumably opens an `if` testing
    // whether def_level marks the first array as null -- confirm against the
    // original header.
58  // OmniSci variable array types have a special encoding for chunks in
59  // which the first array is null: the first 8 bytes of the chunk are
60  // filled and the offset is set appropriately. Ostensibly, this is
61  // done to allow marking a null array by negating a non-zero value;
62  // however, the choice of 8 appears arbitrary.
63  offsets_.push_back(8);
    // Stage 8 zero bytes so the initial offset of 8 points just past them.
64  std::vector<int8_t> zero_bytes(8, 0);
65  data_buffer_bytes_.insert(
66  data_buffer_bytes_.end(), zero_bytes.begin(), zero_bytes.end());
67  } else {
    // Otherwise no padding is staged and the initial offset is 0.
68  offsets_.push_back(0);
69  }
70  }
71  }
72 
74  int64_t last_offset = buffer_->size() + data_buffer_bytes_.size();
    // NOTE(review): listing line 73 (this function's signature) is absent from
    // this extract; the body presumably belongs to appendLastArrayOffset(),
    // which processLastArray() calls -- confirm against the original header.
    //
    // last_offset is the number of bytes already in buffer_ plus the bytes
    // still staged in data_buffer_bytes_. A null array is recorded by pushing
    // the negated value.
    // NOTE(review): last_offset is int64_t, while the reference list at the end
    // of this page gives ArrayOffsetT as int32_t, so push_back narrows the
    // value -- confirm that offsets cannot exceed the int32_t range here.
75  if (!isLastArrayNull()) {
76  // append array data offset
77  offsets_.push_back(last_offset);
78  } else {
79  // append a null array offset
80  offsets_.push_back(-last_offset);
81  }
82  }
83 
85  std::vector<ArrayOffsetT> offsets_;
86 };
87 } // namespace foreign_storage
ParquetVariableLengthArrayEncoder(Data_Namespace::AbstractBuffer *data_buffer, Data_Namespace::AbstractBuffer *index_buffer, std::shared_ptr< ParquetScalarEncoder > scalar_encoder, const ColumnDescriptor *column_desciptor)
An AbstractBuffer is a unit of data management for a data manager.
Specifies the in-memory content of a row in the column metadata table.
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
int32_t ArrayOffsetT
Definition: sqltypes.h:869
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
#define CHECK(condition)
Definition: Logger.h:197
Data_Namespace::AbstractBuffer * buffer_
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override