OmniSciDB  fe05a0c208
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
foreign_storage::TypedParquetInPlaceEncoder< V, T > Class Template Reference

#include <ParquetInPlaceEncoder.h>

+ Inheritance diagram for foreign_storage::TypedParquetInPlaceEncoder< V, T >:
+ Collaboration diagram for foreign_storage::TypedParquetInPlaceEncoder< V, T >:

Public Member Functions

 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
 
void encodeAndCopyContiguous (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
 
void setNull (int8_t *omnisci_data_bytes) override
 
void copy (const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override
 
std::shared_ptr< ChunkMetadata > getRowGroupMetadata (const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type) override
 
- Public Member Functions inherited from foreign_storage::ParquetInPlaceEncoder
 ParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
- Public Member Functions inherited from foreign_storage::ParquetScalarEncoder
 ParquetScalarEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual void encodeAndCopy (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0
 
- Public Member Functions inherited from foreign_storage::ParquetEncoder
 ParquetEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual ~ParquetEncoder ()=default
 

Protected Member Functions

virtual bool encodingIsIdentityForSameTypes () const
 
std::pair< T, T > getUnencodedStats (std::shared_ptr< parquet::Statistics > stats) const
 

Private Member Functions

std::pair< V, V > getEncodedStats (const parquet::ColumnDescriptor *parquet_column_descriptor, std::shared_ptr< parquet::Statistics > stats)
 

Static Private Member Functions

static ChunkStats getUpdatedStats (V &stats_min, V &stats_max, const SQLTypeInfo &column_type)
 

Additional Inherited Members

- Static Protected Member Functions inherited from foreign_storage::ParquetEncoder
static std::shared_ptr< ChunkMetadata > createMetadata (const SQLTypeInfo &column_type)
 
static void throwNotNullViolation (const std::string &parquet_column_name)
 
static void validateNullCount (const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
 
- Protected Attributes inherited from foreign_storage::ParquetInPlaceEncoder
const size_t omnisci_data_type_byte_size_
 
- Protected Attributes inherited from foreign_storage::ParquetEncoder
Data_Namespace::AbstractBuffer * buffer_
 

Detailed Description

template<typename V, typename T>
class foreign_storage::TypedParquetInPlaceEncoder< V, T >

Definition at line 114 of file ParquetInPlaceEncoder.h.

Constructor & Destructor Documentation

template<typename V, typename T>
foreign_storage::TypedParquetInPlaceEncoder< V, T >::TypedParquetInPlaceEncoder ( Data_Namespace::AbstractBuffer *  buffer,
const ColumnDescriptor *  column_desciptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)
inline

Definition at line 116 of file ParquetInPlaceEncoder.h.

119  : ParquetInPlaceEncoder(
120  buffer,
121  column_desciptor->columnType.get_size(),
122  parquet::GetTypeByteSize(parquet_column_descriptor->physical_type())) {}
ParquetInPlaceEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
HOST DEVICE int get_size() const
Definition: sqltypes.h:324
SQLTypeInfo columnType
template<typename V, typename T>
foreign_storage::TypedParquetInPlaceEncoder< V, T >::TypedParquetInPlaceEncoder ( Data_Namespace::AbstractBuffer *  buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size 
)
inline

Definition at line 124 of file ParquetInPlaceEncoder.h.

127  : ParquetInPlaceEncoder(buffer,
128  omnisci_data_type_byte_size,
129  parquet_data_type_byte_size) {}
ParquetInPlaceEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)

Member Function Documentation

template<typename V, typename T>
void foreign_storage::TypedParquetInPlaceEncoder< V, T >::appendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
const bool  is_last_batch,
int8_t *  values 
)
inline override virtual

This is a specialization of ParquetInPlaceEncoder::appendData for known types that allows for optimization.

See comment for ParquetInPlaceEncoder::appendData for details.

Reimplemented from foreign_storage::ParquetInPlaceEncoder.

Definition at line 137 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::ParquetStringEncoder< V >::appendData().

142  {
143  if (std::is_same<V, T>::value && values_read == levels_read) {
144  if (!encodingIsIdentityForSameTypes()) {
145  for (int64_t i = 0; i < levels_read; ++i) {
146  encodeAndCopy(values + i * omnisci_data_type_byte_size_,
147  values + i * omnisci_data_type_byte_size_);
148  }
149  }
150  buffer_->append(values, levels_read * omnisci_data_type_byte_size_);
151  } else {
152  ParquetInPlaceEncoder::appendData(
153  def_levels, rep_levels, values_read, levels_read, is_last_batch, values);
154  }
155  }
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, const bool is_last_batch, int8_t *values) override
virtual void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
Data_Namespace::AbstractBuffer * buffer_

+ Here is the caller graph for this function:

template<typename V, typename T>
void foreign_storage::TypedParquetInPlaceEncoder< V, T >::copy ( const int8_t *  omnisci_data_bytes_source,
int8_t *  omnisci_data_bytes_destination 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 173 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::ParquetStringEncoder< V >::encodeAndCopy().

174  {
175  const auto& omnisci_data_value_source =
176  reinterpret_cast<const V*>(omnisci_data_bytes_source)[0];
177  auto& omnisci_data_value_destination =
178  reinterpret_cast<V*>(omnisci_data_bytes_destination)[0];
179  omnisci_data_value_destination = omnisci_data_value_source;
180  }

+ Here is the caller graph for this function:

template<typename V, typename T>
void foreign_storage::TypedParquetInPlaceEncoder< V, T >::encodeAndCopyContiguous ( const int8_t *  parquet_data_bytes,
int8_t *  omnisci_data_bytes,
const size_t  num_elements 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 157 of file ParquetInPlaceEncoder.h.

159  {
160  auto parquet_data_ptr = reinterpret_cast<const T*>(parquet_data_bytes);
161  auto omnisci_data_ptr = reinterpret_cast<V*>(omnisci_data_bytes);
162  for (size_t i = 0; i < num_elements; ++i) {
163  encodeAndCopy(reinterpret_cast<const int8_t*>(&parquet_data_ptr[i]),
164  reinterpret_cast<int8_t*>(&omnisci_data_ptr[i]));
165  }
166  }
virtual void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0
template<typename V, typename T>
virtual bool foreign_storage::TypedParquetInPlaceEncoder< V, T >::encodingIsIdentityForSameTypes ( ) const
inline protected virtual

Reimplemented in foreign_storage::ParquetStringEncoder< V >, foreign_storage::ParquetFixedLengthEncoder< V, T >, and foreign_storage::ParquetDecimalEncoder< V, T >.

Definition at line 228 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::TypedParquetInPlaceEncoder< int64_t, int32_t >::appendData().

228 { return false; }

+ Here is the caller graph for this function:

template<typename V, typename T>
std::pair<V, V> foreign_storage::TypedParquetInPlaceEncoder< V, T >::getEncodedStats ( const parquet::ColumnDescriptor *  parquet_column_descriptor,
std::shared_ptr< parquet::Statistics >  stats 
)
inline private

Definition at line 260 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::TypedParquetInPlaceEncoder< int64_t, int32_t >::getRowGroupMetadata().

262  {
263  V stats_min, stats_max;
264  auto min_string = stats->EncodeMin();
265  auto max_string = stats->EncodeMax();
266  if (parquet_column_descriptor->physical_type() ==
267  parquet::Type::FIXED_LEN_BYTE_ARRAY) {
268  parquet::FixedLenByteArray min_byte_array, max_byte_array;
269  min_byte_array.ptr = reinterpret_cast<const uint8_t*>(min_string.data());
270  max_byte_array.ptr = reinterpret_cast<const uint8_t*>(max_string.data());
271  encodeAndCopy(reinterpret_cast<int8_t*>(&min_byte_array),
272  reinterpret_cast<int8_t*>(&stats_min));
273  encodeAndCopy(reinterpret_cast<int8_t*>(&max_byte_array),
274  reinterpret_cast<int8_t*>(&stats_max));
275  } else if (parquet_column_descriptor->physical_type() == parquet::Type::BYTE_ARRAY) {
276  parquet::ByteArray min_byte_array, max_byte_array;
277  min_byte_array.ptr = reinterpret_cast<const uint8_t*>(min_string.data());
278  min_byte_array.len = min_string.length();
279  max_byte_array.ptr = reinterpret_cast<const uint8_t*>(max_string.data());
280  max_byte_array.len = max_string.length();
281  encodeAndCopy(reinterpret_cast<int8_t*>(&min_byte_array),
282  reinterpret_cast<int8_t*>(&stats_min));
283  encodeAndCopy(reinterpret_cast<int8_t*>(&max_byte_array),
284  reinterpret_cast<int8_t*>(&stats_max));
285  } else {
286  encodeAndCopy(reinterpret_cast<int8_t*>(min_string.data()),
287  reinterpret_cast<int8_t*>(&stats_min));
288  encodeAndCopy(reinterpret_cast<int8_t*>(max_string.data()),
289  reinterpret_cast<int8_t*>(&stats_max));
290  }
291  return {stats_min, stats_max};
292  }
virtual void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0

+ Here is the caller graph for this function:

template<typename V, typename T>
std::shared_ptr<ChunkMetadata> foreign_storage::TypedParquetInPlaceEncoder< V, T >::getRowGroupMetadata ( const parquet::RowGroupMetaData *  group_metadata,
const int  parquet_column_index,
const SQLTypeInfo &  column_type 
)
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 182 of file ParquetInPlaceEncoder.h.

185  {
186  auto metadata = ParquetEncoder::createMetadata(column_type);
187  auto column_metadata = group_metadata->ColumnChunk(parquet_column_index);
188 
189  // update statistics
190  auto parquet_column_descriptor =
191  group_metadata->schema()->Column(parquet_column_index);
192  auto stats = validate_and_get_column_metadata_statistics(column_metadata.get());
193  if (stats->HasMinMax()) {
194  // validate statistics if validation applicable as part of encoding
195  if (auto parquet_scalar_validator = dynamic_cast<ParquetMetadataValidator*>(this)) {
196  try {
197  parquet_scalar_validator->validate(
198  stats, column_type.is_array() ? column_type.get_elem_type() : column_type);
199  } catch (const std::exception& e) {
200  std::stringstream error_message;
201  error_message << e.what() << " Error validating statistics of Parquet column '"
202  << group_metadata->schema()->Column(parquet_column_index)->name()
203  << "'";
204  throw std::runtime_error(error_message.str());
205  }
206  }
207 
208  auto [stats_min, stats_max] = getEncodedStats(parquet_column_descriptor, stats);
209  auto updated_chunk_stats = getUpdatedStats(stats_min, stats_max, column_type);
210  metadata->fillChunkStats(updated_chunk_stats.min,
211  updated_chunk_stats.max,
212  metadata->chunkStats.has_nulls);
213  }
214  auto null_count = stats->null_count();
215  validateNullCount(group_metadata->schema()->Column(parquet_column_index)->name(),
216  null_count,
217  column_type);
218  metadata->chunkStats.has_nulls = null_count > 0;
219 
220  // update sizing
221  metadata->numBytes = omnisci_data_type_byte_size_ * column_metadata->num_values();
222  metadata->numElements = group_metadata->num_rows();
223 
224  return metadata;
225  }
static ChunkStats getUpdatedStats(V &stats_min, V &stats_max, const SQLTypeInfo &column_type)
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
std::pair< V, V > getEncodedStats(const parquet::ColumnDescriptor *parquet_column_descriptor, std::shared_ptr< parquet::Statistics > stats)
static void validateNullCount(const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
static std::shared_ptr< ChunkMetadata > createMetadata(const SQLTypeInfo &column_type)
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:713
bool is_array() const
Definition: sqltypes.h:497
template<typename V, typename T>
std::pair<T, T> foreign_storage::TypedParquetInPlaceEncoder< V, T >::getUnencodedStats ( std::shared_ptr< parquet::Statistics >  stats) const
inline protected

Definition at line 230 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::ParquetDateInSecondsEncoder::validate(), foreign_storage::ParquetTimestampEncoder< V, T, conversion_denominator >::validate(), foreign_storage::ParquetUnsignedFixedLengthEncoder< V, T, U >::validate(), and foreign_storage::ParquetFixedLengthEncoder< V, T >::validateIntegralOrFloatingPointValue().

230  {
231  T stats_min = reinterpret_cast<T*>(stats->EncodeMin().data())[0];
232  T stats_max = reinterpret_cast<T*>(stats->EncodeMax().data())[0];
233  return {stats_min, stats_max};
234  }

+ Here is the caller graph for this function:

template<typename V, typename T>
static ChunkStats foreign_storage::TypedParquetInPlaceEncoder< V, T >::getUpdatedStats ( V &  stats_min,
V &  stats_max,
const SQLTypeInfo &  column_type 
)
inline static private

Definition at line 237 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::TypedParquetInPlaceEncoder< int64_t, int32_t >::getRowGroupMetadata().

239  {
240  ForeignStorageBuffer buffer;
241  buffer.initEncoder(column_type);
242  auto encoder = buffer.getEncoder();
243 
244  if (column_type.is_array()) {
245  ArrayDatum min_datum(
246  sizeof(V), reinterpret_cast<int8_t*>(&stats_min), false, DoNothingDeleter());
247  ArrayDatum max_datum(
248  sizeof(V), reinterpret_cast<int8_t*>(&stats_max), false, DoNothingDeleter());
249  std::vector<ArrayDatum> min_max_datums{min_datum, max_datum};
250  encoder->updateStats(&min_max_datums, 0, 1);
251  } else {
252  encoder->updateStats(reinterpret_cast<int8_t*>(&stats_min), 1);
253  encoder->updateStats(reinterpret_cast<int8_t*>(&stats_max), 1);
254  }
255  auto updated_chunk_stats_metadata = std::make_shared<ChunkMetadata>();
256  encoder->getMetadata(updated_chunk_stats_metadata);
257  return updated_chunk_stats_metadata->chunkStats;
258  }
void initEncoder(const SQLTypeInfo &tmp_sql_type)
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:202
bool is_array() const
Definition: sqltypes.h:497

+ Here is the caller graph for this function:

template<typename V, typename T>
void foreign_storage::TypedParquetInPlaceEncoder< V, T >::setNull ( int8_t *  omnisci_data_bytes)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 168 of file ParquetInPlaceEncoder.h.

168  {
169  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
170  omnisci_data_value = get_null_value<V>();
171  }

The documentation for this class was generated from the following file: