OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::ParquetArrayEncoder Class Reference

#include <ParquetArrayEncoder.h>

+ Inheritance diagram for foreign_storage::ParquetArrayEncoder:
+ Collaboration diagram for foreign_storage::ParquetArrayEncoder:

Public Member Functions

 ParquetArrayEncoder (Data_Namespace::AbstractBuffer *data_buffer, std::shared_ptr< ParquetScalarEncoder > scalar_encoder, const ColumnDescriptor *column_desciptor)
 
void appendDataTrackErrors (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void finalizeRowGroup ()
 
std::shared_ptr< ChunkMetadata > getRowGroupMetadata (const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type) override
 
virtual void disableMetadataStatsValidation () override
 
virtual void initializeErrorTracking () override
 
virtual void initializeColumnType (const SQLTypeInfo &column_type) override
 
- Public Member Functions inherited from foreign_storage::ParquetEncoder
 ParquetEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual ~ParquetEncoder ()=default
 
RejectedRowIndices getRejectedRowIndices () const
 

Protected Member Functions

virtual void processLastArray ()
 
virtual void appendArraysToBuffer ()
 
bool isLastArrayNull () const
 
bool isLastArrayEmpty () const
 
size_t sizeOfLastArray () const
 
int8_t * resizeArrayDataBytes (const size_t additional_num_elements)
 
virtual void resetLastArrayMetadata ()
 
bool isNewArray (const int16_t rep_level) const
 
int8_t * encodedDataAtIndex (const size_t index)
 
void updateMetadataForAppendedArrayItem (const int64_t encoded_index)
 
virtual void appendArrayItem (const int64_t encoded_index)
 
virtual void encodeAllValues (const int8_t *values, const int64_t values_read)
 

Protected Attributes

size_t omnisci_data_type_byte_size_
 
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_
 
std::vector< int8_t > data_buffer_bytes_
 
- Protected Attributes inherited from foreign_storage::ParquetEncoder
Data_Namespace::AbstractBuffer * buffer_
 
bool is_error_tracking_enabled_
 
RejectedRowIndices invalid_indices_
 
size_t current_chunk_offset_
 
SQLTypeInfo column_type_
 
bool validate_metadata_stats_
 

Static Protected Attributes

static const int16_t non_null_def_level = 3
 
static const int16_t item_null_def_level = 2
 
static const int16_t empty_list_def_level = 1
 
static const int16_t list_null_def_level = 0
 

Private Member Functions

void processArrayItem (const int16_t def_level, int64_t &encoded_index)
 
void markArrayAsNull ()
 
void markArrayAsEmpty ()
 
void appendNullArrayItem ()
 

Private Attributes

std::vector< int8_t > encode_buffer_
 
bool has_assembly_started_
 
bool is_null_array_
 
bool is_empty_array_
 
size_t num_elements_in_array_
 
size_t num_array_assembled_
 
bool is_invalid_array_
 
std::vector< bool > is_valid_item_
 

Additional Inherited Members

- Static Protected Member Functions inherited from foreign_storage::ParquetEncoder
static std::shared_ptr< ChunkMetadata > createMetadata (const SQLTypeInfo &column_type)
 
static void throwNotNullViolation (const std::string &parquet_column_name)
 
static void validateNullCount (const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
 

Detailed Description

Definition at line 23 of file ParquetArrayEncoder.h.

Constructor & Destructor Documentation

foreign_storage::ParquetArrayEncoder::ParquetArrayEncoder ( Data_Namespace::AbstractBuffer *  data_buffer,
std::shared_ptr< ParquetScalarEncoder >  scalar_encoder,
const ColumnDescriptor *  column_desciptor 
)
inline

Definition at line 25 of file ParquetArrayEncoder.h.

28  : ParquetEncoder(data_buffer)
30  column_desciptor->columnType.get_elem_type().get_size())
31  , scalar_encoder_(scalar_encoder)
32  , has_assembly_started_(false)
33  , is_null_array_(false)
34  , is_empty_array_(false)
37  , is_invalid_array_(false) {}
ParquetEncoder(Data_Namespace::AbstractBuffer *buffer)
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
SQLTypeInfo columnType
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:975
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

Member Function Documentation

virtual void foreign_storage::ParquetArrayEncoder::appendArrayItem ( const int64_t  encoded_index)
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder, and foreign_storage::ParquetArrayDetectEncoder.

Definition at line 171 of file ParquetArrayEncoder.h.

References encodedDataAtIndex(), resizeArrayDataBytes(), scalar_encoder_, and updateMetadataForAppendedArrayItem().

Referenced by foreign_storage::ParquetArrayImportEncoder::appendArrayItem(), and processArrayItem().

171  {
172  auto omnisci_data_ptr = resizeArrayDataBytes(1);
173  scalar_encoder_->copy(encodedDataAtIndex(encoded_index), omnisci_data_ptr);
174  updateMetadataForAppendedArrayItem(encoded_index);
175  }
int8_t * resizeArrayDataBytes(const size_t additional_num_elements)
int8_t * encodedDataAtIndex(const size_t index)
void updateMetadataForAppendedArrayItem(const int64_t encoded_index)
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::appendArraysToBuffer ( )
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder, foreign_storage::ParquetArrayDetectEncoder, and foreign_storage::ParquetVariableLengthArrayEncoder.

Definition at line 119 of file ParquetArrayEncoder.h.

References Data_Namespace::AbstractBuffer::append(), foreign_storage::ParquetEncoder::buffer_, and data_buffer_bytes_.

Referenced by foreign_storage::ParquetVariableLengthArrayEncoder::appendArraysToBuffer(), and finalizeRowGroup().

119  {
121  data_buffer_bytes_.clear();
122  }
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
Data_Namespace::AbstractBuffer * buffer_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::appendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values 
)
inline override virtual

Implements foreign_storage::ParquetEncoder.

Reimplemented in foreign_storage::ParquetVariableLengthArrayEncoder.

Definition at line 57 of file ParquetArrayEncoder.h.

References CHECK, encodeAllValues(), isNewArray(), processArrayItem(), processLastArray(), and resetLastArrayMetadata().

Referenced by foreign_storage::ParquetVariableLengthArrayEncoder::appendData(), appendDataTrackErrors(), and foreign_storage::ParquetArrayImportEncoder::validateAndAppendData().

61  {
62  CHECK(levels_read > 0);
63 
64  // encode all values in the temporary in-memory `encode_buffer_`, doing
65  // this encoding as a batch rather than element-wise exposes opportunities
66  // for performance optimization for certain scalar types
67  encodeAllValues(values, values_read);
68 
69  for (int64_t i = 0, j = 0; i < levels_read; ++i) {
70  if (isNewArray(rep_levels[i])) {
73  }
74  processArrayItem(def_levels[i], j);
75  }
76  }
void processArrayItem(const int16_t def_level, int64_t &encoded_index)
virtual void encodeAllValues(const int8_t *values, const int64_t values_read)
#define CHECK(condition)
Definition: Logger.h:291
bool isNewArray(const int16_t rep_level) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::appendDataTrackErrors ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values 
)
inline override virtual

Implements foreign_storage::ParquetEncoder.

Definition at line 39 of file ParquetArrayEncoder.h.

References appendData(), CHECK, foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_valid_item_, and scalar_encoder_.

43  {
45  // validate all elements
46  is_valid_item_.assign(values_read, true);
47  for (int64_t j = 0; j < values_read; ++j) {
48  try {
49  scalar_encoder_->validateUsingEncodersColumnType(values, j);
50  } catch (const std::runtime_error& error) {
51  is_valid_item_[j] = false;
52  }
53  }
54  appendData(def_levels, rep_levels, values_read, levels_read, values);
55  }
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

void foreign_storage::ParquetArrayEncoder::appendNullArrayItem ( )
inline private

Definition at line 204 of file ParquetArrayEncoder.h.

References num_elements_in_array_, resizeArrayDataBytes(), and scalar_encoder_.

Referenced by processArrayItem().

204  {
207  }
int8_t * resizeArrayDataBytes(const size_t additional_num_elements)
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::disableMetadataStatsValidation ( )
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 95 of file ParquetArrayEncoder.h.

References foreign_storage::ParquetEncoder::disableMetadataStatsValidation(), and scalar_encoder_.

95  {
97  scalar_encoder_->disableMetadataStatsValidation();
98  }
virtual void disableMetadataStatsValidation()
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::encodeAllValues ( const int8_t *  values,
const int64_t  values_read 
)
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayDetectEncoder.

Definition at line 177 of file ParquetArrayEncoder.h.

References encode_buffer_, omnisci_data_type_byte_size_, and scalar_encoder_.

Referenced by appendData(), and foreign_storage::ParquetArrayDetectEncoder::encodeAllValues().

177  {
178  encode_buffer_.resize(values_read * omnisci_data_type_byte_size_);
179  scalar_encoder_->encodeAndCopyContiguous(values, encode_buffer_.data(), values_read);
180  }
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the caller graph for this function:

int8_t* foreign_storage::ParquetArrayEncoder::encodedDataAtIndex ( const size_t  index)
inline protected

Definition at line 160 of file ParquetArrayEncoder.h.

References encode_buffer_, and omnisci_data_type_byte_size_.

Referenced by foreign_storage::ParquetArrayDetectEncoder::appendArrayItem(), and appendArrayItem().

160  {
161  return encode_buffer_.data() + (index)*omnisci_data_type_byte_size_;
162  }

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::finalizeRowGroup ( )
inline

Definition at line 78 of file ParquetArrayEncoder.h.

References appendArraysToBuffer(), has_assembly_started_, processLastArray(), and resetLastArrayMetadata().

+ Here is the call graph for this function:

std::shared_ptr<ChunkMetadata> foreign_storage::ParquetArrayEncoder::getRowGroupMetadata ( const parquet::RowGroupMetaData *  group_metadata,
const int  parquet_column_index,
const SQLTypeInfo &  column_type 
)
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Reimplemented in foreign_storage::ParquetFixedLengthArrayEncoder.

Definition at line 85 of file ParquetArrayEncoder.h.

References scalar_encoder_.

Referenced by foreign_storage::ParquetFixedLengthArrayEncoder::getRowGroupMetadata().

88  {
89  auto metadata = scalar_encoder_->getRowGroupMetadata(
90  group_metadata, parquet_column_index, column_type);
91  metadata->numBytes = 0; // number of bytes is not known
92  return metadata;
93  }
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::initializeColumnType ( const SQLTypeInfo &  column_type)
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 105 of file ParquetArrayEncoder.h.

References SQLTypeInfo::get_elem_type(), foreign_storage::ParquetEncoder::initializeColumnType(), and scalar_encoder_.

105  {
107  scalar_encoder_->initializeColumnType(column_type.get_elem_type());
108  }
virtual void initializeColumnType(const SQLTypeInfo &column_type)
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:975
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::initializeErrorTracking ( )
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 100 of file ParquetArrayEncoder.h.

References foreign_storage::ParquetEncoder::initializeErrorTracking(), and scalar_encoder_.

100  {
102  scalar_encoder_->initializeErrorTracking();
103  }
virtual void initializeErrorTracking()
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_

+ Here is the call graph for this function:

bool foreign_storage::ParquetArrayEncoder::isLastArrayEmpty ( ) const
inline protected
bool foreign_storage::ParquetArrayEncoder::isLastArrayNull ( ) const
inline protected
bool foreign_storage::ParquetArrayEncoder::isNewArray ( const int16_t  rep_level) const
inline protected

Definition at line 156 of file ParquetArrayEncoder.h.

References has_assembly_started_.

Referenced by appendData().

156  {
157  return rep_level == 0 && has_assembly_started_;
158  }

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::markArrayAsEmpty ( )
inline private

Definition at line 202 of file ParquetArrayEncoder.h.

References is_empty_array_.

Referenced by processArrayItem().

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::markArrayAsNull ( )
inline private

Definition at line 200 of file ParquetArrayEncoder.h.

References is_null_array_.

Referenced by processArrayItem().

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::processArrayItem ( const int16_t  def_level,
int64_t &  encoded_index 
)
inline private

Definition at line 183 of file ParquetArrayEncoder.h.

References appendArrayItem(), appendNullArrayItem(), empty_list_def_level, has_assembly_started_, item_null_def_level, list_null_def_level, markArrayAsEmpty(), markArrayAsNull(), non_null_def_level, and UNREACHABLE.

Referenced by appendData().

183  {
184  has_assembly_started_ = true;
185  if (def_level == non_null_def_level) {
186  // push back a scalar element to in-memory data buffer
187  appendArrayItem(encoded_index++);
188  } else if (def_level == item_null_def_level) {
189  // push back a scalar null to in-memory data buffer
191  } else if (def_level == list_null_def_level) {
192  markArrayAsNull();
193  } else if (def_level == empty_list_def_level) {
195  } else {
196  UNREACHABLE();
197  }
198  }
virtual void appendArrayItem(const int64_t encoded_index)
#define UNREACHABLE()
Definition: Logger.h:338

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::processLastArray ( )
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder, foreign_storage::ParquetArrayDetectEncoder, foreign_storage::ParquetVariableLengthArrayEncoder, and foreign_storage::ParquetFixedLengthArrayEncoder.

Definition at line 111 of file ParquetArrayEncoder.h.

References foreign_storage::ParquetEncoder::column_type_, SQLTypeInfo::get_notnull(), foreign_storage::ParquetEncoder::invalid_indices_, foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_invalid_array_, isLastArrayNull(), and num_array_assembled_.

Referenced by appendData(), finalizeRowGroup(), foreign_storage::ParquetFixedLengthArrayEncoder::processLastArray(), foreign_storage::ParquetVariableLengthArrayEncoder::processLastArray(), and foreign_storage::ParquetArrayDetectEncoder::processLastArray().

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual void foreign_storage::ParquetArrayEncoder::resetLastArrayMetadata ( )
inline protected virtual

Reimplemented in foreign_storage::ParquetArrayImportEncoder.

Definition at line 147 of file ParquetArrayEncoder.h.

References is_empty_array_, foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_invalid_array_, is_null_array_, and num_elements_in_array_.

Referenced by appendData(), finalizeRowGroup(), and foreign_storage::ParquetArrayImportEncoder::resetLastArrayMetadata().

+ Here is the caller graph for this function:

int8_t* foreign_storage::ParquetArrayEncoder::resizeArrayDataBytes ( const size_t  additional_num_elements)
inline protected

Definition at line 130 of file ParquetArrayEncoder.h.

References data_buffer_bytes_, and omnisci_data_type_byte_size_.

Referenced by appendArrayItem(), appendNullArrayItem(), and foreign_storage::ParquetFixedLengthArrayEncoder::appendNullFixedLengthArray().

130  {
131  auto current_data_byte_size = data_buffer_bytes_.size();
132  data_buffer_bytes_.resize(current_data_byte_size +
133  additional_num_elements * omnisci_data_type_byte_size_);
134  return data_buffer_bytes_.data() + current_data_byte_size;
135  }

+ Here is the caller graph for this function:

size_t foreign_storage::ParquetArrayEncoder::sizeOfLastArray ( ) const
inline protected

Definition at line 128 of file ParquetArrayEncoder.h.

References num_elements_in_array_.

Referenced by foreign_storage::ParquetFixedLengthArrayEncoder::appendNullArrayOrCheckArraySize(), and foreign_storage::ParquetArrayImportEncoder::appendToArrayDatumBuffer().

+ Here is the caller graph for this function:

void foreign_storage::ParquetArrayEncoder::updateMetadataForAppendedArrayItem ( const int64_t  encoded_index)
inline protected

Definition at line 164 of file ParquetArrayEncoder.h.

References foreign_storage::ParquetEncoder::is_error_tracking_enabled_, is_invalid_array_, is_valid_item_, and num_elements_in_array_.

Referenced by foreign_storage::ParquetArrayDetectEncoder::appendArrayItem(), and appendArrayItem().

+ Here is the caller graph for this function:

Member Data Documentation

const int16_t foreign_storage::ParquetArrayEncoder::empty_list_def_level = 1
static protected
std::vector<int8_t> foreign_storage::ParquetArrayEncoder::encode_buffer_
private

Definition at line 209 of file ParquetArrayEncoder.h.

Referenced by encodeAllValues(), and encodedDataAtIndex().

bool foreign_storage::ParquetArrayEncoder::has_assembly_started_
private

Definition at line 210 of file ParquetArrayEncoder.h.

Referenced by finalizeRowGroup(), isNewArray(), and processArrayItem().

bool foreign_storage::ParquetArrayEncoder::is_empty_array_
private
bool foreign_storage::ParquetArrayEncoder::is_invalid_array_
private
bool foreign_storage::ParquetArrayEncoder::is_null_array_
private

Definition at line 211 of file ParquetArrayEncoder.h.

Referenced by isLastArrayNull(), markArrayAsNull(), and resetLastArrayMetadata().

std::vector<bool> foreign_storage::ParquetArrayEncoder::is_valid_item_
private
const int16_t foreign_storage::ParquetArrayEncoder::item_null_def_level = 2
static protected

Definition at line 143 of file ParquetArrayEncoder.h.

Referenced by processArrayItem().

const int16_t foreign_storage::ParquetArrayEncoder::list_null_def_level = 0
static protected
const int16_t foreign_storage::ParquetArrayEncoder::non_null_def_level = 3
static protected

Definition at line 142 of file ParquetArrayEncoder.h.

Referenced by processArrayItem().

size_t foreign_storage::ParquetArrayEncoder::num_array_assembled_
private

Definition at line 216 of file ParquetArrayEncoder.h.

Referenced by processLastArray().

size_t foreign_storage::ParquetArrayEncoder::num_elements_in_array_
private

The documentation for this class was generated from the following file: