OmniSciDB  471d68cefb
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType > Class Template Reference

#include <ParquetInPlaceEncoder.h>

+ Inheritance diagram for foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >:
+ Collaboration diagram for foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >:

Public Member Functions

 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
void validate (const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
 
void validateAndAppendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values, const SQLTypeInfo &column_type, InvalidRowGroupIndices &invalid_indices) override
 
void eraseInvalidIndicesInBuffer (const InvalidRowGroupIndices &invalid_indices) override
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void encodeAndCopyContiguous (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
 
void setNull (int8_t *omnisci_data_bytes) override
 
void copy (const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override
 
std::shared_ptr< ChunkMetadata > getRowGroupMetadata (const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type) override
 
- Public Member Functions inherited from foreign_storage::ParquetInPlaceEncoder
 ParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
- Public Member Functions inherited from foreign_storage::ParquetScalarEncoder
 ParquetScalarEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual void encodeAndCopy (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0
 
- Public Member Functions inherited from foreign_storage::ParquetEncoder
 ParquetEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual ~ParquetEncoder ()=default
 

Protected Member Functions

virtual bool encodingIsIdentityForSameTypes () const
 
std::pair< T, T > getUnencodedStats (std::shared_ptr< parquet::Statistics > stats) const
 

Private Member Functions

std::pair< V, V > getEncodedStats (const parquet::ColumnDescriptor *parquet_column_descriptor, std::shared_ptr< parquet::Statistics > stats)
 

Static Private Member Functions

static ChunkStats getUpdatedStats (V &stats_min, V &stats_max, const SQLTypeInfo &column_type)
 

Private Attributes

int64_t current_batch_offset_ = 0
 

Additional Inherited Members

- Static Protected Member Functions inherited from foreign_storage::ParquetEncoder
static std::shared_ptr< ChunkMetadata > createMetadata (const SQLTypeInfo &column_type)
 
static void throwNotNullViolation (const std::string &parquet_column_name)
 
static void validateNullCount (const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
 
- Protected Attributes inherited from foreign_storage::ParquetInPlaceEncoder
const size_t omnisci_data_type_byte_size_
 
const size_t parquet_data_type_byte_size_
 
- Protected Attributes inherited from foreign_storage::ParquetEncoder
Data_Namespace::AbstractBuffer * buffer_
 

Detailed Description

template<typename V, typename T, typename NullType = V>
class foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >

Definition at line 111 of file ParquetInPlaceEncoder.h.

Constructor & Destructor Documentation

template<typename V, typename T, typename NullType = V>
foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::TypedParquetInPlaceEncoder ( Data_Namespace::AbstractBuffer *  buffer,
const ColumnDescriptor *  column_desciptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)
inline

Definition at line 113 of file ParquetInPlaceEncoder.h.

117  buffer,
118  sizeof(V),
119  parquet::GetTypeByteSize(parquet_column_descriptor->physical_type()))
120  , current_batch_offset_(0) {}
ParquetInPlaceEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
template<typename V, typename T, typename NullType = V>
foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::TypedParquetInPlaceEncoder ( Data_Namespace::AbstractBuffer *  buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size 
)
inline

Definition at line 122 of file ParquetInPlaceEncoder.h.

125  : ParquetInPlaceEncoder(buffer, sizeof(V), parquet_data_type_byte_size)
126  , current_batch_offset_(0) {}
ParquetInPlaceEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)

Member Function Documentation

template<typename V, typename T, typename NullType = V>
void foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::appendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values 
)
inline override virtual

This is a specialization of ParquetInPlaceEncoder::appendData for known types that allows for optimization.

See comment for ParquetInPlaceEncoder::appendData for details.

Reimplemented from foreign_storage::ParquetInPlaceEncoder.

Definition at line 181 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::ParquetStringEncoder< V >::appendData(), and foreign_storage::TypedParquetInPlaceEncoder< V, V >::validateAndAppendData().

185  {
186  if (std::is_same<V, T>::value && values_read == levels_read) {
187  if (!encodingIsIdentityForSameTypes()) {
188  for (int64_t i = 0; i < levels_read; ++i) {
189  encodeAndCopy(values + i * omnisci_data_type_byte_size_,
190  values + i * omnisci_data_type_byte_size_);
191  }
192  }
193  buffer_->append(values, levels_read * omnisci_data_type_byte_size_);
194  } else {
195  ParquetInPlaceEncoder::appendData(
196  def_levels, rep_levels, values_read, levels_read, values);
197  }
198  }
virtual void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
Data_Namespace::AbstractBuffer * buffer_
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override

+ Here is the caller graph for this function:

template<typename V, typename T, typename NullType = V>
void foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::copy ( const int8_t *  omnisci_data_bytes_source,
int8_t *  omnisci_data_bytes_destination 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 216 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::ParquetStringEncoder< V >::encodeAndCopy().

217  {
218  const auto& omnisci_data_value_source =
219  reinterpret_cast<const V*>(omnisci_data_bytes_source)[0];
220  auto& omnisci_data_value_destination =
221  reinterpret_cast<V*>(omnisci_data_bytes_destination)[0];
222  omnisci_data_value_destination = omnisci_data_value_source;
223  }

+ Here is the caller graph for this function:

template<typename V, typename T, typename NullType = V>
void foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::encodeAndCopyContiguous ( const int8_t *  parquet_data_bytes,
int8_t *  omnisci_data_bytes,
const size_t  num_elements 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 200 of file ParquetInPlaceEncoder.h.

202  {
203  auto parquet_data_ptr = reinterpret_cast<const T*>(parquet_data_bytes);
204  auto omnisci_data_ptr = reinterpret_cast<V*>(omnisci_data_bytes);
205  for (size_t i = 0; i < num_elements; ++i) {
206  encodeAndCopy(reinterpret_cast<const int8_t*>(&parquet_data_ptr[i]),
207  reinterpret_cast<int8_t*>(&omnisci_data_ptr[i]));
208  }
209  }
virtual void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0
template<typename V, typename T, typename NullType = V>
virtual bool foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::encodingIsIdentityForSameTypes ( ) const
inline protected virtual

Reimplemented in foreign_storage::ParquetStringEncoder< V >, foreign_storage::ParquetFixedLengthEncoder< V, T, NullType >, and foreign_storage::ParquetDecimalEncoder< V, T, NullType >.

Definition at line 274 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::TypedParquetInPlaceEncoder< V, V >::appendData().

274 { return false; }

+ Here is the caller graph for this function:

template<typename V, typename T, typename NullType = V>
void foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::eraseInvalidIndicesInBuffer ( const InvalidRowGroupIndices &  invalid_indices)
inline override virtual

Implements foreign_storage::ParquetImportEncoder.

Definition at line 156 of file ParquetInPlaceEncoder.h.

157  {
158  if (invalid_indices.empty()) {
159  return;
160  }
161  auto omnisci_data_values = reinterpret_cast<V*>(buffer_->getMemoryPtr());
163  size_t num_elements = buffer_->size() / omnisci_data_type_byte_size_;
164  std::remove_if(
165  omnisci_data_values, omnisci_data_values + num_elements, [&](const V& value) {
166  const V* start = omnisci_data_values;
167  auto index = std::distance(start, &value);
168  return invalid_indices.find(index) != invalid_indices.end();
169  });
170  size_t num_bytes_erased = invalid_indices.size() * omnisci_data_type_byte_size_;
171  CHECK(num_bytes_erased <= buffer_->size());
172  buffer_->setSize(buffer_->size() - num_bytes_erased);
173  }
virtual int8_t * getMemoryPtr()=0
void setSize(const size_t size)
#define CHECK(condition)
Definition: Logger.h:209
Data_Namespace::AbstractBuffer * buffer_
template<typename V, typename T, typename NullType = V>
std::pair<V, V> foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::getEncodedStats ( const parquet::ColumnDescriptor *  parquet_column_descriptor,
std::shared_ptr< parquet::Statistics >  stats 
)
inline private

Definition at line 306 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::TypedParquetInPlaceEncoder< V, V >::getRowGroupMetadata().

308  {
309  V stats_min, stats_max;
310  auto min_string = stats->EncodeMin();
311  auto max_string = stats->EncodeMax();
312  if (parquet_column_descriptor->physical_type() ==
313  parquet::Type::FIXED_LEN_BYTE_ARRAY) {
314  parquet::FixedLenByteArray min_byte_array, max_byte_array;
315  min_byte_array.ptr = reinterpret_cast<const uint8_t*>(min_string.data());
316  max_byte_array.ptr = reinterpret_cast<const uint8_t*>(max_string.data());
317  encodeAndCopy(reinterpret_cast<int8_t*>(&min_byte_array),
318  reinterpret_cast<int8_t*>(&stats_min));
319  encodeAndCopy(reinterpret_cast<int8_t*>(&max_byte_array),
320  reinterpret_cast<int8_t*>(&stats_max));
321  } else if (parquet_column_descriptor->physical_type() == parquet::Type::BYTE_ARRAY) {
322  parquet::ByteArray min_byte_array, max_byte_array;
323  min_byte_array.ptr = reinterpret_cast<const uint8_t*>(min_string.data());
324  min_byte_array.len = min_string.length();
325  max_byte_array.ptr = reinterpret_cast<const uint8_t*>(max_string.data());
326  max_byte_array.len = max_string.length();
327  encodeAndCopy(reinterpret_cast<int8_t*>(&min_byte_array),
328  reinterpret_cast<int8_t*>(&stats_min));
329  encodeAndCopy(reinterpret_cast<int8_t*>(&max_byte_array),
330  reinterpret_cast<int8_t*>(&stats_max));
331  } else {
332  encodeAndCopy(reinterpret_cast<int8_t*>(min_string.data()),
333  reinterpret_cast<int8_t*>(&stats_min));
334  encodeAndCopy(reinterpret_cast<int8_t*>(max_string.data()),
335  reinterpret_cast<int8_t*>(&stats_max));
336  }
337  return {stats_min, stats_max};
338  }
virtual void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0

+ Here is the caller graph for this function:

template<typename V, typename T, typename NullType = V>
std::shared_ptr<ChunkMetadata> foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::getRowGroupMetadata ( const parquet::RowGroupMetaData *  group_metadata,
const int  parquet_column_index,
const SQLTypeInfo &  column_type 
)
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 225 of file ParquetInPlaceEncoder.h.

228  {
229  auto metadata = ParquetEncoder::createMetadata(column_type);
230  auto column_metadata = group_metadata->ColumnChunk(parquet_column_index);
231 
232  // update statistics
233  auto parquet_column_descriptor =
234  group_metadata->schema()->Column(parquet_column_index);
235  auto stats = validate_and_get_column_metadata_statistics(column_metadata.get());
236  if (stats->HasMinMax()) {
237  // validate statistics if validation applicable as part of encoding
238  if (auto parquet_scalar_validator = dynamic_cast<ParquetMetadataValidator*>(this)) {
239  try {
240  parquet_scalar_validator->validate(
241  stats, column_type.is_array() ? column_type.get_elem_type() : column_type);
242  } catch (const std::exception& e) {
243  std::stringstream error_message;
244  error_message << e.what() << " Error validating statistics of Parquet column '"
245  << group_metadata->schema()->Column(parquet_column_index)->name()
246  << "'";
247  throw std::runtime_error(error_message.str());
248  }
249  }
250 
251  auto [stats_min, stats_max] = getEncodedStats(parquet_column_descriptor, stats);
252  auto updated_chunk_stats = getUpdatedStats(stats_min, stats_max, column_type);
253  metadata->fillChunkStats(updated_chunk_stats.min,
254  updated_chunk_stats.max,
255  metadata->chunkStats.has_nulls);
256  }
257  auto null_count = stats->null_count();
258  validateNullCount(group_metadata->schema()->Column(parquet_column_index)->name(),
259  null_count,
260  column_type);
261  metadata->chunkStats.has_nulls = null_count > 0;
262 
263  // update sizing
264  metadata->numBytes =
265  sizeof(NullType) // use NullType byte size since it is guaranteed to
266  // be the byte size of stored data
267  * column_metadata->num_values();
268  metadata->numElements = group_metadata->num_rows();
269 
270  return metadata;
271  }
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
static void validateNullCount(const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
static std::shared_ptr< ChunkMetadata > createMetadata(const SQLTypeInfo &column_type)
std::pair< V, V > getEncodedStats(const parquet::ColumnDescriptor *parquet_column_descriptor, std::shared_ptr< parquet::Statistics > stats)
static ChunkStats getUpdatedStats(V &stats_min, V &stats_max, const SQLTypeInfo &column_type)
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:850
bool is_array() const
Definition: sqltypes.h:517
template<typename V, typename T, typename NullType = V>
std::pair<T, T> foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::getUnencodedStats ( std::shared_ptr< parquet::Statistics >  stats) const
inline protected

Definition at line 276 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::ParquetDateInSecondsEncoder< NullType >::validate(), foreign_storage::ParquetTimestampEncoder< V, T, conversion_denominator, NullType >::validate(), foreign_storage::ParquetUnsignedFixedLengthEncoder< V, T, U, NullType >::validate(), and foreign_storage::ParquetFixedLengthEncoder< V, T, NullType >::validateIntegralOrFloatingPointMetadata().

276  {
277  T stats_min = reinterpret_cast<T*>(stats->EncodeMin().data())[0];
278  T stats_max = reinterpret_cast<T*>(stats->EncodeMax().data())[0];
279  return {stats_min, stats_max};
280  }

+ Here is the caller graph for this function:

template<typename V, typename T, typename NullType = V>
static ChunkStats foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::getUpdatedStats ( V &  stats_min,
V &  stats_max,
const SQLTypeInfo &  column_type 
)
inline static private

Definition at line 283 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::TypedParquetInPlaceEncoder< V, V >::getRowGroupMetadata().

285  {
286  ForeignStorageBuffer buffer;
287  buffer.initEncoder(column_type);
288  auto encoder = buffer.getEncoder();
289 
290  if (column_type.is_array()) {
291  ArrayDatum min_datum(
292  sizeof(V), reinterpret_cast<int8_t*>(&stats_min), false, DoNothingDeleter());
293  ArrayDatum max_datum(
294  sizeof(V), reinterpret_cast<int8_t*>(&stats_max), false, DoNothingDeleter());
295  std::vector<ArrayDatum> min_max_datums{min_datum, max_datum};
296  encoder->updateStats(&min_max_datums, 0, 1);
297  } else {
298  encoder->updateStats(reinterpret_cast<int8_t*>(&stats_min), 1);
299  encoder->updateStats(reinterpret_cast<int8_t*>(&stats_max), 1);
300  }
301  auto updated_chunk_stats_metadata = std::make_shared<ChunkMetadata>();
302  encoder->getMetadata(updated_chunk_stats_metadata);
303  return updated_chunk_stats_metadata->chunkStats;
304  }
void initEncoder(const SQLTypeInfo &tmp_sql_type)
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:208
bool is_array() const
Definition: sqltypes.h:517

+ Here is the caller graph for this function:

template<typename V, typename T, typename NullType = V>
void foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::setNull ( int8_t *  omnisci_data_bytes)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 211 of file ParquetInPlaceEncoder.h.

211  {
212  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
213  omnisci_data_value = get_null_value<NullType>();
214  }
template<typename V, typename T, typename NullType = V>
void foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::validate ( const int8_t *  parquet_data,
const int64_t  j,
const SQLTypeInfo &  column_type 
) const
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Reimplemented in foreign_storage::ParquetTimestampEncoder< V, T, conversion_denominator, NullType >.

Definition at line 128 of file ParquetInPlaceEncoder.h.

Referenced by foreign_storage::TypedParquetInPlaceEncoder< V, V >::validateAndAppendData().

130  {
131  // no-op by default
132  }

+ Here is the caller graph for this function:

template<typename V, typename T, typename NullType = V>
void foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::validateAndAppendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values,
const SQLTypeInfo &  column_type,
InvalidRowGroupIndices &  invalid_indices 
)
inline override virtual

Implements foreign_storage::ParquetImportEncoder.

Definition at line 134 of file ParquetInPlaceEncoder.h.

140  {
141  int64_t i, j;
142  for (i = 0, j = 0; i < levels_read; ++i) {
143  if (def_levels[i]) {
144  try {
145  CHECK(j < values_read);
146  validate(values, j++, column_type);
147  } catch (const std::runtime_error& error) {
148  invalid_indices.insert(current_batch_offset_ + i);
149  }
150  }
151  }
152  current_batch_offset_ += levels_read;
153  appendData(def_levels, rep_levels, values_read, levels_read, values);
154  }
void validate(const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
#define CHECK(condition)
Definition: Logger.h:209

Member Data Documentation

template<typename V, typename T, typename NullType = V>
int64_t foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::current_batch_offset_ = 0
private

The documentation for this class was generated from the following file: