OmniSciDB  2e3a973ef4
foreign_storage::ParquetStringEncoder< V > Class Template Reference

#include <ParquetStringEncoder.h>

+ Inheritance diagram for foreign_storage::ParquetStringEncoder< V >:
+ Collaboration diagram for foreign_storage::ParquetStringEncoder< V >:

Public Member Functions

 ParquetStringEncoder (Data_Namespace::AbstractBuffer *buffer, StringDictionary *string_dictionary, std::unique_ptr< ChunkMetadata > &chunk_metadata)
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
 
void encodeAndCopyContiguous (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
 
void encodeAndCopy (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
 
- Public Member Functions inherited from foreign_storage::TypedParquetInPlaceEncoder< V, V >
 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
 
void encodeAndCopyContiguous (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
 
void setNull (int8_t *omnisci_data_bytes) override
 
void copy (const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override
 
- Public Member Functions inherited from foreign_storage::ParquetInPlaceEncoder
 ParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
- Public Member Functions inherited from foreign_storage::ParquetScalarEncoder
 ParquetScalarEncoder (Data_Namespace::AbstractBuffer *buffer)
 
- Public Member Functions inherited from foreign_storage::ParquetEncoder
 ParquetEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual ~ParquetEncoder ()=default
 

Protected Member Functions

bool encodingIsIdentityForSameTypes () const override
 

Private Member Functions

void updateMetadataStats (int64_t values_read, int8_t *values)
 

Private Attributes

StringDictionary * string_dictionary_
 
std::unique_ptr< ChunkMetadata > & chunk_metadata_
 
std::vector< int8_t > encode_buffer_
 
V min_
 
V max_
 

Additional Inherited Members

- Protected Attributes inherited from foreign_storage::ParquetInPlaceEncoder
const size_t omnisci_data_type_byte_size_
 
- Protected Attributes inherited from foreign_storage::ParquetEncoder
Data_Namespace::AbstractBuffer * buffer_
 

Detailed Description

template<typename V>
class foreign_storage::ParquetStringEncoder< V >

Definition at line 29 of file ParquetStringEncoder.h.

Constructor & Destructor Documentation

◆ ParquetStringEncoder()

template<typename V >
foreign_storage::ParquetStringEncoder< V >::ParquetStringEncoder ( Data_Namespace::AbstractBuffer *  buffer,
StringDictionary *  string_dictionary,
std::unique_ptr< ChunkMetadata > &  chunk_metadata 
)
inline

Definition at line 31 of file ParquetStringEncoder.h.

34  : TypedParquetInPlaceEncoder<V, V>(buffer, sizeof(V), sizeof(V))
35  , string_dictionary_(string_dictionary)
36  , chunk_metadata_(chunk_metadata)
38  , min_(std::numeric_limits<V>::max())
39  , max_(std::numeric_limits<V>::lowest()) {}
std::unique_ptr< ChunkMetadata > & chunk_metadata_

Member Function Documentation

◆ appendData()

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::appendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
const bool  is_last_batch,
int8_t *  values 
)
inline override virtual

Appends Parquet data to the buffer using an in-place algorithm. Any necessary transformation or validation of the data and decoding of nulls is part of appending the data. Each class inheriting from this abstract class must implement the functionality to copy, nullify and encode the data.

Parameters
def_levels - an array containing the Dremel encoding definition levels
rep_levels - an array containing the Dremel encoding repetition levels
values_read - the number of non-null values read
levels_read - the total number of values (non-null & null) that are read
is_last_batch - flag indicating if this is the last read for the row group
values - values that are read

Note that the Parquet format encodes nulls using Dremel encoding.

Reimplemented from foreign_storage::ParquetInPlaceEncoder.

Definition at line 41 of file ParquetStringEncoder.h.

References foreign_storage::TypedParquetInPlaceEncoder< V, T >::appendData(), foreign_storage::ParquetStringEncoder< V >::encode_buffer_, and foreign_storage::ParquetStringEncoder< V >::encodeAndCopyContiguous().

46  {
47  encodeAndCopyContiguous(values, encode_buffer_.data(), values_read);
48  TypedParquetInPlaceEncoder<V, V>::appendData(def_levels,
49  rep_levels,
50  values_read,
51  levels_read,
52  is_last_batch,
53  encode_buffer_.data());
54  }
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
void encodeAndCopyContiguous(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
+ Here is the call graph for this function:

◆ encodeAndCopy()

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::encodeAndCopy ( const int8_t *  parquet_data_bytes,
int8_t *  omnisci_data_bytes 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 73 of file ParquetStringEncoder.h.

References foreign_storage::TypedParquetInPlaceEncoder< V, T >::copy().

74  {
75  TypedParquetInPlaceEncoder<V, V>::copy(parquet_data_bytes, omnisci_data_bytes);
76  }
void copy(const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override
+ Here is the call graph for this function:

◆ encodeAndCopyContiguous()

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::encodeAndCopyContiguous ( const int8_t *  parquet_data_bytes,
int8_t *  omnisci_data_bytes,
const size_t  num_elements 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 56 of file ParquetStringEncoder.h.

References StringDictionary::getOrAddBulk(), foreign_storage::ParquetStringEncoder< V >::string_dictionary_, and foreign_storage::ParquetStringEncoder< V >::updateMetadataStats().

Referenced by foreign_storage::ParquetStringEncoder< V >::appendData().

58  {
59  auto parquet_data_ptr =
60  reinterpret_cast<const parquet::ByteArray*>(parquet_data_bytes);
61  auto omnisci_data_ptr = reinterpret_cast<V*>(omnisci_data_bytes);
62  std::vector<std::string_view> string_views;
63  string_views.reserve(num_elements);
64  for (size_t i = 0; i < num_elements; ++i) {
65  auto& byte_array = parquet_data_ptr[i];
66  string_views.emplace_back(reinterpret_cast<const char*>(byte_array.ptr),
67  byte_array.len);
68  }
69  string_dictionary_->getOrAddBulk(string_views, omnisci_data_ptr);
70  updateMetadataStats(num_elements, omnisci_data_bytes);
71  }
void updateMetadataStats(int64_t values_read, int8_t *values)
void getOrAddBulk(const std::vector< String > &string_vec, T *encoded_vec)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ encodingIsIdentityForSameTypes()

template<typename V >
bool foreign_storage::ParquetStringEncoder< V >::encodingIsIdentityForSameTypes ( ) const
inline override protected virtual

Reimplemented from foreign_storage::TypedParquetInPlaceEncoder< V, V >.

Definition at line 79 of file ParquetStringEncoder.h.

79 { return true; }

◆ updateMetadataStats()

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::updateMetadataStats ( int64_t  values_read,
int8_t *  values 
)
inline private

Definition at line 82 of file ParquetStringEncoder.h.

References foreign_storage::ParquetStringEncoder< V >::chunk_metadata_, foreign_storage::ParquetStringEncoder< V >::max_, and foreign_storage::ParquetStringEncoder< V >::min_.

Referenced by foreign_storage::ParquetStringEncoder< V >::encodeAndCopyContiguous().

82  {
83  V* data_ptr = reinterpret_cast<V*>(values);
84  for (int64_t i = 0; i < values_read; ++i) {
85  min_ = std::min<V>(data_ptr[i], min_);
86  max_ = std::max<V>(data_ptr[i], max_);
87  }
88  chunk_metadata_->fillChunkStats(min_, max_, false);
89  }
std::unique_ptr< ChunkMetadata > & chunk_metadata_
+ Here is the caller graph for this function:

Member Data Documentation

◆ chunk_metadata_

template<typename V >
std::unique_ptr<ChunkMetadata>& foreign_storage::ParquetStringEncoder< V >::chunk_metadata_
private

◆ encode_buffer_

template<typename V >
std::vector<int8_t> foreign_storage::ParquetStringEncoder< V >::encode_buffer_
private

◆ max_

template<typename V >
V foreign_storage::ParquetStringEncoder< V >::max_
private

◆ min_

template<typename V >
V foreign_storage::ParquetStringEncoder< V >::min_
private

◆ string_dictionary_

template<typename V >
StringDictionary* foreign_storage::ParquetStringEncoder< V >::string_dictionary_
private

The documentation for this class was generated from the following file: