OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::ParquetArrayEncoder Class Reference

#include <ParquetArrayEncoder.h>

+ Inheritance diagram for foreign_storage::ParquetArrayEncoder:
+ Collaboration diagram for foreign_storage::ParquetArrayEncoder:

Public Member Functions

 ParquetArrayEncoder (Data_Namespace::AbstractBuffer *data_buffer, std::shared_ptr< ParquetScalarEncoder > scalar_encoder, const ColumnDescriptor *column_desciptor)
 
void appendDataTrackErrors (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void finalizeRowGroup ()
 
std::shared_ptr< ChunkMetadata > getRowGroupMetadata (const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type) override
 
virtual void disableMetadataStatsValidation () override
 
virtual void initializeErrorTracking (const SQLTypeInfo &column_type) override
 
- Public Member Functions inherited from foreign_storage::ParquetEncoder
 ParquetEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual ~ParquetEncoder ()=default
 
RejectedRowIndices getRejectedRowIndices () const
 

Protected Member Functions

virtual void processLastArray ()
 
virtual void appendArraysToBuffer ()
 
bool isLastArrayNull () const
 
bool isLastArrayEmpty () const
 
size_t sizeOfLastArray () const
 
int8_t * resizeArrayDataBytes (const size_t additional_num_elements)
 
virtual void resetLastArrayMetadata ()
 
bool isNewArray (const int16_t rep_level) const
 
int8_t * encodedDataAtIndex (const size_t index)
 
void updateMetadataForAppendedArrayItem (const int64_t encoded_index)
 
virtual void appendArrayItem (const int64_t encoded_index)
 
virtual void encodeAllValues (const int8_t *values, const int64_t values_read)
 

Protected Attributes

size_t omnisci_data_type_byte_size_
 
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_
 
std::vector< int8_t > data_buffer_bytes_
 
- Protected Attributes inherited from foreign_storage::ParquetEncoder
Data_Namespace::AbstractBuffer * buffer_
 
bool is_error_tracking_enabled_
 
RejectedRowIndices invalid_indices_
 
size_t current_chunk_offset_
 
SQLTypeInfo column_type_
 
bool validate_metadata_stats_
 

Static Protected Attributes

static const int16_t non_null_def_level = 3
 
static const int16_t item_null_def_level = 2
 
static const int16_t empty_list_def_level = 1
 
static const int16_t list_null_def_level = 0
 

Private Member Functions

void processArrayItem (const int16_t def_level, int64_t &encoded_index)
 
void markArrayAsNull ()
 
void markArrayAsEmpty ()
 
void appendNullArrayItem ()
 

Private Attributes

std::vector< int8_t > encode_buffer_
 
bool has_assembly_started_
 
bool is_null_array_
 
bool is_empty_array_
 
size_t num_elements_in_array_
 
size_t num_array_assembled_
 
bool is_invalid_array_
 
std::vector< bool > is_valid_item_
 

Additional Inherited Members

- Static Protected Member Functions inherited from foreign_storage::ParquetEncoder
static std::shared_ptr< ChunkMetadata > createMetadata (const SQLTypeInfo &column_type)
 
static void throwNotNullViolation (const std::string &parquet_column_name)
 
static void validateNullCount (const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
 

Detailed Description

Definition at line 23 of file ParquetArrayEncoder.h.

Constructor & Destructor Documentation

foreign_storage::ParquetArrayEncoder::ParquetArrayEncoder ( Data_Namespace::AbstractBuffer * data_buffer,
std::shared_ptr< ParquetScalarEncoder > scalar_encoder,
const ColumnDescriptor * column_desciptor 
)
inline

Definition at line 25 of file ParquetArrayEncoder.h.

28  : ParquetEncoder(data_buffer)
30  column_desciptor->columnType.get_elem_type().get_size())
31  , scalar_encoder_(scalar_encoder)
32  , has_assembly_started_(false)
33  , is_null_array_(false)
34  , is_empty_array_(false)
37  , is_invalid_array_(false) {}
ParquetEncoder(Data_Namespace::AbstractBuffer *buffer)
HOST DEVICE int get_size() const
Definition: sqltypes.h:393
SQLTypeInfo columnType
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:963
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

Member Function Documentation

virtual void foreign_storage::ParquetArrayEncoder::appendArrayItem ( const int64_t  encoded_index)
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder, and foreign_storage::ParquetArrayDetectEncoder.

Definition at line 166 of file ParquetArrayEncoder.h.

References encodedDataAtIndex(), resizeArrayDataBytes(), scalar_encoder_, and updateMetadataForAppendedArrayItem().

Referenced by foreign_storage::ParquetArrayImportEncoder::appendArrayItem(), and processArrayItem().

166  {
167  auto omnisci_data_ptr = resizeArrayDataBytes(1);
168  scalar_encoder_->copy(encodedDataAtIndex(encoded_index), omnisci_data_ptr);
169  updateMetadataForAppendedArrayItem(encoded_index);
170  }
int8_t * resizeArrayDataBytes(const size_t additional_num_elements)
int8_t * encodedDataAtIndex(const size_t index)
void updateMetadataForAppendedArrayItem(const int64_t encoded_index)
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::appendArraysToBuffer ( )
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder, foreign_storage::ParquetArrayDetectEncoder, and foreign_storage::ParquetVariableLengthArrayEncoder.

Definition at line 114 of file ParquetArrayEncoder.h.

References Data_Namespace::AbstractBuffer::append(), foreign_storage::ParquetEncoder::buffer_, and data_buffer_bytes_.

Referenced by foreign_storage::ParquetVariableLengthArrayEncoder::appendArraysToBuffer(), and finalizeRowGroup().

114  {
116  data_buffer_bytes_.clear();
117  }
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
Data_Namespace::AbstractBuffer * buffer_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::appendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values 
)
inline override virtual

Implements foreign_storage::ParquetEncoder.

Reimplemented in foreign_storage::ParquetVariableLengthArrayEncoder.

Definition at line 57 of file ParquetArrayEncoder.h.

References CHECK, encodeAllValues(), isNewArray(), processArrayItem(), processLastArray(), and resetLastArrayMetadata().

Referenced by foreign_storage::ParquetVariableLengthArrayEncoder::appendData(), appendDataTrackErrors(), and foreign_storage::ParquetArrayImportEncoder::validateAndAppendData().

61  {
62  CHECK(levels_read > 0);
63 
64  // encode all values in the temporary in-memory `encode_buffer_`, doing
65  // this encoding as a batch rather than element-wise exposes opportunities
66  // for performance optimization for certain scalar types
67  encodeAllValues(values, values_read);
68 
69  for (int64_t i = 0, j = 0; i < levels_read; ++i) {
70  if (isNewArray(rep_levels[i])) {
73  }
74  processArrayItem(def_levels[i], j);
75  }
76  }
void processArrayItem(const int16_t def_level, int64_t &encoded_index)
virtual void encodeAllValues(const int8_t *values, const int64_t values_read)
#define CHECK(condition)
Definition: Logger.h:291
bool isNewArray(const int16_t rep_level) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::appendDataTrackErrors ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values 
)
inline override virtual

Implements foreign_storage::ParquetEncoder.

Definition at line 39 of file ParquetArrayEncoder.h.

References appendData(), CHECK, foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_valid_item_, and scalar_encoder_.

43  {
45  // validate all elements
46  is_valid_item_.assign(values_read, true);
47  for (int64_t j = 0; j < values_read; ++j) {
48  try {
49  scalar_encoder_->validateUsingEncodersColumnType(values, j);
50  } catch (const std::runtime_error& error) {
51  is_valid_item_[j] = false;
52  }
53  }
54  appendData(def_levels, rep_levels, values_read, levels_read, values);
55  }
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

void foreign_storage::ParquetArrayEncoder::appendNullArrayItem ( )
inline private

Definition at line 199 of file ParquetArrayEncoder.h.

References num_elements_in_array_, resizeArrayDataBytes(), and scalar_encoder_.

Referenced by processArrayItem().

199  {
202  }
int8_t * resizeArrayDataBytes(const size_t additional_num_elements)
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::disableMetadataStatsValidation ( )
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 95 of file ParquetArrayEncoder.h.

References foreign_storage::ParquetEncoder::disableMetadataStatsValidation(), and scalar_encoder_.

95  {
97  scalar_encoder_->disableMetadataStatsValidation();
98  }
virtual void disableMetadataStatsValidation()
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::encodeAllValues ( const int8_t *  values,
const int64_t  values_read 
)
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayDetectEncoder.

Definition at line 172 of file ParquetArrayEncoder.h.

References encode_buffer_, omnisci_data_type_byte_size_, and scalar_encoder_.

Referenced by appendData(), and foreign_storage::ParquetArrayDetectEncoder::encodeAllValues().

172  {
173  encode_buffer_.resize(values_read * omnisci_data_type_byte_size_);
174  scalar_encoder_->encodeAndCopyContiguous(values, encode_buffer_.data(), values_read);
175  }
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the caller graph for this function:

int8_t* foreign_storage::ParquetArrayEncoder::encodedDataAtIndex ( const size_t  index)
inline protected

Definition at line 155 of file ParquetArrayEncoder.h.

References encode_buffer_, and omnisci_data_type_byte_size_.

Referenced by foreign_storage::ParquetArrayDetectEncoder::appendArrayItem(), and appendArrayItem().

155  {
156  return encode_buffer_.data() + (index)*omnisci_data_type_byte_size_;
157  }

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::finalizeRowGroup ( )
inline

Definition at line 78 of file ParquetArrayEncoder.h.

References appendArraysToBuffer(), has_assembly_started_, processLastArray(), and resetLastArrayMetadata().

+ Here is the call graph for this function:

std::shared_ptr<ChunkMetadata> foreign_storage::ParquetArrayEncoder::getRowGroupMetadata ( const parquet::RowGroupMetaData *  group_metadata,
const int  parquet_column_index,
const SQLTypeInfo & column_type 
)
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Reimplemented in foreign_storage::ParquetFixedLengthArrayEncoder.

Definition at line 85 of file ParquetArrayEncoder.h.

References scalar_encoder_.

Referenced by foreign_storage::ParquetFixedLengthArrayEncoder::getRowGroupMetadata().

88  {
89  auto metadata = scalar_encoder_->getRowGroupMetadata(
90  group_metadata, parquet_column_index, column_type);
91  metadata->numBytes = 0; // number of bytes is not known
92  return metadata;
93  }
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::initializeErrorTracking ( const SQLTypeInfo & column_type)
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 100 of file ParquetArrayEncoder.h.

References SQLTypeInfo::get_elem_type(), foreign_storage::ParquetEncoder::initializeErrorTracking(), and scalar_encoder_.

100  {
102  scalar_encoder_->initializeErrorTracking(column_type.get_elem_type());
103  }
virtual void initializeErrorTracking(const SQLTypeInfo &column_type)
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:963
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

bool foreign_storage::ParquetArrayEncoder::isLastArrayEmpty ( ) const
inline protected
bool foreign_storage::ParquetArrayEncoder::isLastArrayNull ( ) const
inline protected
bool foreign_storage::ParquetArrayEncoder::isNewArray ( const int16_t  rep_level) const
inline protected

Definition at line 151 of file ParquetArrayEncoder.h.

References has_assembly_started_.

Referenced by appendData().

151  {
152  return rep_level == 0 && has_assembly_started_;
153  }

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::markArrayAsEmpty ( )
inline private

Definition at line 197 of file ParquetArrayEncoder.h.

References is_empty_array_.

Referenced by processArrayItem().

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::markArrayAsNull ( )
inline private

Definition at line 195 of file ParquetArrayEncoder.h.

References is_null_array_.

Referenced by processArrayItem().

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::processArrayItem ( const int16_t  def_level,
int64_t &  encoded_index 
)
inline private

Definition at line 178 of file ParquetArrayEncoder.h.

References appendArrayItem(), appendNullArrayItem(), empty_list_def_level, has_assembly_started_, item_null_def_level, list_null_def_level, markArrayAsEmpty(), markArrayAsNull(), non_null_def_level, and UNREACHABLE.

Referenced by appendData().

178  {
179  has_assembly_started_ = true;
180  if (def_level == non_null_def_level) {
181  // push back a scalar element to in-memory data buffer
182  appendArrayItem(encoded_index++);
183  } else if (def_level == item_null_def_level) {
184  // push back a scalar null to in-memory data buffer
186  } else if (def_level == list_null_def_level) {
187  markArrayAsNull();
188  } else if (def_level == empty_list_def_level) {
190  } else {
191  UNREACHABLE();
192  }
193  }
virtual void appendArrayItem(const int64_t encoded_index)
#define UNREACHABLE()
Definition: Logger.h:337

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::processLastArray ( )
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder, foreign_storage::ParquetArrayDetectEncoder, foreign_storage::ParquetVariableLengthArrayEncoder, and foreign_storage::ParquetFixedLengthArrayEncoder.

Definition at line 106 of file ParquetArrayEncoder.h.

References foreign_storage::ParquetEncoder::column_type_, SQLTypeInfo::get_notnull(), foreign_storage::ParquetEncoder::invalid_indices_, foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_invalid_array_, isLastArrayNull(), and num_array_assembled_.

Referenced by appendData(), finalizeRowGroup(), foreign_storage::ParquetFixedLengthArrayEncoder::processLastArray(), foreign_storage::ParquetVariableLengthArrayEncoder::processLastArray(), and foreign_storage::ParquetArrayDetectEncoder::processLastArray().

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::resetLastArrayMetadata ( )
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder.

Definition at line 142 of file ParquetArrayEncoder.h.

References is_empty_array_, foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_invalid_array_, is_null_array_, and num_elements_in_array_.

Referenced by appendData(), finalizeRowGroup(), and foreign_storage::ParquetArrayImportEncoder::resetLastArrayMetadata().

+ Here is the caller graph for this function:

int8_t* foreign_storage::ParquetArrayEncoder::resizeArrayDataBytes ( const size_t  additional_num_elements)
inline protected

Definition at line 125 of file ParquetArrayEncoder.h.

References data_buffer_bytes_, and omnisci_data_type_byte_size_.

Referenced by appendArrayItem(), appendNullArrayItem(), and foreign_storage::ParquetFixedLengthArrayEncoder::appendNullFixedLengthArray().

125  {
126  auto current_data_byte_size = data_buffer_bytes_.size();
127  data_buffer_bytes_.resize(current_data_byte_size +
128  additional_num_elements * omnisci_data_type_byte_size_);
129  return data_buffer_bytes_.data() + current_data_byte_size;
130  }

+ Here is the caller graph for this function:

size_t foreign_storage::ParquetArrayEncoder::sizeOfLastArray ( ) const
inline protected

Definition at line 123 of file ParquetArrayEncoder.h.

References num_elements_in_array_.

Referenced by foreign_storage::ParquetFixedLengthArrayEncoder::appendNullArrayOrCheckArraySize(), and foreign_storage::ParquetArrayImportEncoder::appendToArrayDatumBuffer().

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::updateMetadataForAppendedArrayItem ( const int64_t  encoded_index)
inline protected

Definition at line 159 of file ParquetArrayEncoder.h.

References foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_invalid_array_, is_valid_item_, and num_elements_in_array_.

Referenced by foreign_storage::ParquetArrayDetectEncoder::appendArrayItem(), and appendArrayItem().

+ Here is the caller graph for this function:

Member Data Documentation

const int16_t foreign_storage::ParquetArrayEncoder::empty_list_def_level = 1
static protected
std::vector<int8_t> foreign_storage::ParquetArrayEncoder::encode_buffer_
private

Definition at line 204 of file ParquetArrayEncoder.h.

Referenced by encodeAllValues(), and encodedDataAtIndex().

bool foreign_storage::ParquetArrayEncoder::has_assembly_started_
private

Definition at line 205 of file ParquetArrayEncoder.h.

Referenced by finalizeRowGroup(), isNewArray(), and processArrayItem().

bool foreign_storage::ParquetArrayEncoder::is_empty_array_
private
bool foreign_storage::ParquetArrayEncoder::is_invalid_array_
private
bool foreign_storage::ParquetArrayEncoder::is_null_array_
private

Definition at line 206 of file ParquetArrayEncoder.h.

Referenced by isLastArrayNull(), markArrayAsNull(), and resetLastArrayMetadata().

std::vector<bool> foreign_storage::ParquetArrayEncoder::is_valid_item_
private
const int16_t foreign_storage::ParquetArrayEncoder::item_null_def_level = 2
static protected

Definition at line 138 of file ParquetArrayEncoder.h.

Referenced by processArrayItem().

const int16_t foreign_storage::ParquetArrayEncoder::list_null_def_level = 0
static protected
const int16_t foreign_storage::ParquetArrayEncoder::non_null_def_level = 3
static protected

Definition at line 137 of file ParquetArrayEncoder.h.

Referenced by processArrayItem().

size_t foreign_storage::ParquetArrayEncoder::num_array_assembled_
private

Definition at line 211 of file ParquetArrayEncoder.h.

Referenced by processLastArray().

size_t foreign_storage::ParquetArrayEncoder::num_elements_in_array_
private

The documentation for this class was generated from the following file: