OmniSciDB  471d68cefb
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp} Namespace Reference

Functions

bool is_valid_parquet_string (const parquet::ColumnDescriptor *parquet_column)
 
bool is_valid_parquet_list_column (const parquet::ColumnDescriptor *parquet_column)
 Detect a valid list parquet column. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder_with_omnisci_type (const ColumnDescriptor *column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename U , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
 Create a signed or unsigned integral parquet encoder using types. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder_with_omnisci_type (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const int bit_width, const bool is_signed)
 Create a integral parquet encoder using types. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan=false, const bool is_for_import=false)
 Create a Parquet specific encoder for a Parquet to OmniSci mapping. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import (std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
void validate_definition_levels (const parquet::ParquetFileReader *reader, const int row_group_index, const int column_index, const int16_t *def_levels, const int64_t num_levels, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void validate_max_repetition_and_definition_level (const ColumnDescriptor *omnisci_column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void resize_values_buffer (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::vector< int8_t > &values)
 
bool validate_decimal_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_floating_point_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_integral_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool is_nanosecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_nanosecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_microsecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_microsecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_millisecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_millisecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool validate_none_type_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_timestamp_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_time_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_date_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_string_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_array_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_geospatial_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
void validate_equal_schema (const parquet::arrow::FileReader *reference_file_reader, const parquet::arrow::FileReader *new_file_reader, const std::string &reference_file_path, const std::string &new_file_path)
 
void validate_allowed_mapping (const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)
 
void validate_number_of_columns (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
void throw_missing_metadata_error (const int row_group_index, const int column_index, const std::string &file_path)
 
void throw_row_group_larger_than_fragment_size_error (const int row_group_index, const int64_t max_row_group_size, const int fragment_size, const std::string &file_path)
 
void validate_column_mapping_and_row_group_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
void validate_parquet_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
std::list< RowGroupMetadata > metadata_scan_rowgroup_interval (const std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const RowGroupInterval &row_group_interval, const ReaderPtr &reader, const ForeignTableSchema &schema)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_import (const std::map< int, Chunk_NS::Chunk > chunks, const ForeignTableSchema &schema, const ReaderPtr &reader, const std::map< int, StringDictionary * > column_dictionaries)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_metadata_scan (const Interval< ColumnType > &column_interval, const ForeignTableSchema &schema, const ReaderPtr &reader)
 

Function Documentation

std::shared_ptr< ParquetEncoder > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import 
)

Definition at line 988 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_encoder(), foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), SQLTypeInfo::is_fixlen_array(), and is_valid_parquet_list_column().

Referenced by create_parquet_encoder().

995  {
996  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column);
997  if (!is_valid_parquet_list || !omnisci_column->columnType.is_array()) {
998  return {};
999  }
1000  std::unique_ptr<ColumnDescriptor> omnisci_column_sub_type_column =
1001  get_sub_type_column_descriptor(omnisci_column);
1002  auto encoder = create_parquet_encoder(omnisci_column_sub_type_column.get(),
1003  parquet_column,
1004  chunks,
1005  string_dictionary,
1006  chunk_metadata,
1007  is_metadata_scan,
1008  is_for_import);
1009  CHECK(encoder.get());
1010  auto scalar_encoder = std::dynamic_pointer_cast<ParquetScalarEncoder>(encoder);
1011  CHECK(scalar_encoder);
1012  if (!is_for_import) {
1013  if (omnisci_column->columnType.is_fixlen_array()) {
1014  encoder = std::make_shared<ParquetFixedLengthArrayEncoder>(
1015  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1016  scalar_encoder,
1017  omnisci_column);
1018  } else {
1019  encoder = std::make_shared<ParquetVariableLengthArrayEncoder>(
1020  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1021  is_metadata_scan ? nullptr : chunks.begin()->getIndexBuf(),
1022  scalar_encoder,
1023  omnisci_column);
1024  }
1025  } else { // is_for_import
1026  encoder = std::make_shared<ParquetArrayImportEncoder>(
1027  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1028  }
1029  return encoder;
1030 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
bool is_fixlen_array() const
Definition: sqltypes.h:519
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan=false, const bool is_for_import=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType
bool is_array() const
Definition: sqltypes.h:517

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 726 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, kENCODING_DATE_IN_DAYS, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

730  {
731  auto column_type = omnisci_column->columnType;
732  if (parquet_column->logical_type()->is_date() && column_type.is_date()) {
733  if (column_type.get_compression() == kENCODING_DATE_IN_DAYS) {
734  if (is_metadata_scan_or_for_import) {
735  if (column_type.get_comp_param() ==
736  0) { // DATE ENCODING FIXED (32) uses comp param 0
737  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int32_t>>(
738  buffer);
739  } else if (column_type.get_comp_param() == 16) {
740  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int16_t>>(
741  buffer);
742  } else {
743  UNREACHABLE();
744  }
745  } else {
746  if (column_type.get_comp_param() ==
747  0) { // DATE ENCODING FIXED (32) uses comp param 0
748  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t>>(
749  buffer, omnisci_column, parquet_column);
750  } else if (column_type.get_comp_param() == 16) {
751  return std::make_shared<ParquetFixedLengthEncoder<int16_t, int32_t>>(
752  buffer, omnisci_column, parquet_column);
753  } else {
754  UNREACHABLE();
755  }
756  }
757  } else if (column_type.get_compression() == kENCODING_NONE) { // for array types
758  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int64_t>>(
759  buffer, omnisci_column, parquet_column);
760  } else {
761  UNREACHABLE();
762  }
763  }
764  return {};
765 }
#define UNREACHABLE()
Definition: Logger.h:253
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 683 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_date_from_timestamp_encoder_with_types(), kENCODING_DATE_IN_DAYS, and UNREACHABLE.

Referenced by create_parquet_encoder().

687  {
688  auto column_type = omnisci_column->columnType;
689  if (parquet_column->logical_type()->is_timestamp() && column_type.is_date()) {
690  CHECK(column_type.get_compression() == kENCODING_DATE_IN_DAYS);
691  if (is_metadata_scan_or_for_import) {
692  if (column_type.get_comp_param() ==
693  0) { // DATE ENCODING FIXED (32) uses comp param 0
694  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
695  int64_t,
696  int32_t>(
697  omnisci_column, parquet_column, buffer, true);
698  } else if (column_type.get_comp_param() == 16) {
699  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
700  int64_t,
701  int16_t>(
702  omnisci_column, parquet_column, buffer, true);
703  } else {
704  UNREACHABLE();
705  }
706  } else {
707  if (column_type.get_comp_param() ==
708  0) { // DATE ENCODING FIXED (32) uses comp param 0
709  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
710  int64_t,
711  int32_t>(
712  omnisci_column, parquet_column, buffer, false);
713  } else if (column_type.get_comp_param() == 16) {
714  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
715  int64_t,
716  int16_t>(
717  omnisci_column, parquet_column, buffer, false);
718  } else {
719  UNREACHABLE();
720  }
721  }
722  }
723  return {};
724 }
#define UNREACHABLE()
Definition: Logger.h:253
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 499 of file LazyParquetChunkLoader.cpp.

References kSecsPerDay, omnisci.dtypes::T, and UNREACHABLE.

Referenced by create_parquet_date_from_timestamp_encoder().

503  {
504  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
505  parquet_column->logical_type().get())) {
506  switch (timestamp_logical_type->time_unit()) {
507  case parquet::LogicalType::TimeUnit::MILLIS:
508  if (is_metadata_scan_or_for_import) {
509  return std::make_shared<ParquetDateFromTimestampEncoder<V, T, 1000L, NullType>>(
510  buffer, omnisci_column, parquet_column);
511  }
512  return std::make_shared<
513  ParquetDateFromTimestampEncoder<V, T, 1000L * kSecsPerDay, NullType>>(
514  buffer, omnisci_column, parquet_column);
515  case parquet::LogicalType::TimeUnit::MICROS:
516  if (is_metadata_scan_or_for_import) {
517  return std::make_shared<
518  ParquetDateFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
519  buffer, omnisci_column, parquet_column);
520  }
521  return std::make_shared<
522  ParquetDateFromTimestampEncoder<V, T, 1000L * 1000L * kSecsPerDay, NullType>>(
523  buffer, omnisci_column, parquet_column);
524  case parquet::LogicalType::TimeUnit::NANOS:
525  if (is_metadata_scan_or_for_import) {
526  return std::make_shared<
527  ParquetDateFromTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
528  buffer, omnisci_column, parquet_column);
529  }
530  return std::make_shared<
531  ParquetDateFromTimestampEncoder<V,
532  T,
533  1000L * 1000L * 1000L * kSecsPerDay,
534  NullType>>(
535  buffer, omnisci_column, parquet_column);
536  default:
537  UNREACHABLE();
538  }
539  } else {
540  UNREACHABLE();
541  }
542  return {};
543 }
static constexpr int64_t kSecsPerDay
#define UNREACHABLE()
Definition: Logger.h:253

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 162 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

166  {
167  if (parquet_column->logical_type()->is_decimal()) {
168  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
169  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int64_t>(
170  omnisci_column, parquet_column, buffer);
171  }
172  CHECK(omnisci_column->columnType.get_compression() == kENCODING_FIXED);
173  if (is_metadata_scan_or_for_import) {
174  switch (omnisci_column->columnType.get_comp_param()) {
175  case 16:
176  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int16_t>(
177  omnisci_column, parquet_column, buffer);
178  case 32:
179  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int32_t>(
180  omnisci_column, parquet_column, buffer);
181  default:
182  UNREACHABLE();
183  }
184  } else {
185  switch (omnisci_column->columnType.get_comp_param()) {
186  case 16:
187  return create_parquet_decimal_encoder_with_omnisci_type<int16_t, int16_t>(
188  omnisci_column, parquet_column, buffer);
189  case 32:
190  return create_parquet_decimal_encoder_with_omnisci_type<int32_t, int32_t>(
191  omnisci_column, parquet_column, buffer);
192  default:
193  UNREACHABLE();
194  }
195  }
196  }
197  return {};
198 }
#define UNREACHABLE()
Definition: Logger.h:253
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder_with_omnisci_type ( const ColumnDescriptor *  column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor,
AbstractBuffer *  buffer
)

Definition at line 138 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

141  {
142  switch (parquet_column_descriptor->physical_type()) {
143  case parquet::Type::INT32:
144  return std::make_shared<ParquetDecimalEncoder<V, int32_t, NullType>>(
145  buffer, column_descriptor, parquet_column_descriptor);
146  case parquet::Type::INT64:
147  return std::make_shared<ParquetDecimalEncoder<V, int64_t, NullType>>(
148  buffer, column_descriptor, parquet_column_descriptor);
149  case parquet::Type::FIXED_LEN_BYTE_ARRAY:
150  return std::make_shared<
151  ParquetDecimalEncoder<V, parquet::FixedLenByteArray, NullType>>(
152  buffer, column_descriptor, parquet_column_descriptor);
153  case parquet::Type::BYTE_ARRAY:
154  return std::make_shared<ParquetDecimalEncoder<V, parquet::ByteArray, NullType>>(
155  buffer, column_descriptor, parquet_column_descriptor);
156  default:
157  UNREACHABLE();
158  }
159  return {};
160 }
#define UNREACHABLE()
Definition: Logger.h:253
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan = false,
const bool  is_for_import = false 
)

Create a Parquet specific encoder for a Parquet to OmniSci mapping.

Parameters
omnisci_column- the descriptor of OmniSci column
parquet_column- the descriptor of Parquet column
chunks- list of chunks to populate (the case of more than one chunk happens only if a logical column expands to multiple physical columns)
string_dictionary- string dictionary used in encoding for string dictionary encoded columns
chunk_metadata- similar to the list of chunks, a list of chunk metadata that is populated
is_metadata_scan- a flag indicating if the encoders created should be for a metadata scan
is_for_import- a flag indicating if the encoders created should be for import
Returns
An appropriate Parquet encoder for the use case defined by the Parquet to OmniSci mapping.

Notes:

  • In the case of a metadata scan, the type of the encoder created may significantly change (for example in bit width.) This is because it is common for OmniSci to store metadata in a different format altogether than the data itself (see for example FixedLengthEncoder.)
  • This function and the function isColumnMappingSupported work in conjunction with each other. For example, once a mapping is known to be allowed (since isColumnMappingSupported returned true) this function does not have to check many corner cases exhaustively as it would be redundant with what was checked in isColumnMappingSupported.

Definition at line 885 of file LazyParquetChunkLoader.cpp.

References CHECK, create_parquet_array_encoder(), create_parquet_date_encoder(), create_parquet_date_from_timestamp_encoder(), create_parquet_decimal_encoder(), create_parquet_floating_point_encoder(), create_parquet_geospatial_encoder(), create_parquet_integral_encoder(), create_parquet_none_type_encoder(), create_parquet_string_encoder(), create_parquet_time_encoder(), create_parquet_timestamp_encoder(), and UNREACHABLE.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), create_parquet_array_encoder(), create_parquet_encoder_for_import(), and create_parquet_encoder_for_metadata_scan().

892  {
893  CHECK(!(is_metadata_scan && is_for_import));
894  auto buffer = chunks.empty() ? nullptr : chunks.begin()->getBuffer();
895  if (auto encoder = create_parquet_geospatial_encoder(omnisci_column,
896  parquet_column,
897  chunks,
898  chunk_metadata,
899  is_metadata_scan,
900  is_for_import)) {
901  return encoder;
902  }
903  if (auto encoder = create_parquet_array_encoder(omnisci_column,
904  parquet_column,
905  chunks,
906  string_dictionary,
907  chunk_metadata,
908  is_metadata_scan,
909  is_for_import)) {
910  return encoder;
911  }
912  if (auto encoder = create_parquet_decimal_encoder(
913  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
914  return encoder;
915  }
916  if (auto encoder = create_parquet_integral_encoder(
917  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
918  return encoder;
919  }
920  if (auto encoder =
921  create_parquet_floating_point_encoder(omnisci_column, parquet_column, buffer)) {
922  return encoder;
923  }
924  if (auto encoder = create_parquet_timestamp_encoder(
925  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
926  return encoder;
927  }
928  if (auto encoder =
929  create_parquet_none_type_encoder(omnisci_column, parquet_column, buffer)) {
930  return encoder;
931  }
932  if (auto encoder = create_parquet_time_encoder(
933  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
934  return encoder;
935  }
936  if (auto encoder = create_parquet_date_from_timestamp_encoder(
937  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
938  return encoder;
939  }
940  if (auto encoder = create_parquet_date_encoder(
941  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
942  return encoder;
943  }
944  if (auto encoder = create_parquet_string_encoder(
945  omnisci_column,
946  parquet_column,
947  chunks.empty() ? Chunk_NS::Chunk{} : *chunks.begin(),
948  string_dictionary,
949  chunk_metadata,
950  is_for_import)) {
951  return encoder;
952  }
953  UNREACHABLE();
954  return {};
955 }
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import)
#define UNREACHABLE()
Definition: Logger.h:253
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
#define CHECK(condition)
Definition: Logger.h:209
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_import ( std::list< Chunk_NS::Chunk > &  chunks,
const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
StringDictionary *  string_dictionary
)

Intended to be used for the import case.

Definition at line 960 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_import().

964  {
965  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
966  return create_parquet_encoder(omnisci_column,
967  parquet_column,
968  chunks,
969  string_dictionary,
970  chunk_metadata,
971  false,
972  true);
973 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan=false, const bool is_for_import=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_metadata_scan ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column
)

Intended to be used only with metadata scan. Creates an incomplete encoder capable of updating metadata.

Definition at line 979 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_metadata_scan().

981  {
982  std::list<Chunk_NS::Chunk> chunks;
983  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
984  return create_parquet_encoder(
985  omnisci_column, parquet_column, chunks, nullptr, chunk_metadata, true);
986 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan=false, const bool is_for_import=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_floating_point_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer
)

Definition at line 418 of file LazyParquetChunkLoader.cpp.

References CHECK, CHECK_EQ, ColumnDescriptor::columnType, DOUBLE, FLOAT, kDOUBLE, kENCODING_NONE, kFLOAT, and UNREACHABLE.

Referenced by create_parquet_encoder().

421  {
422  auto column_type = omnisci_column->columnType;
423  if (!column_type.is_fp()) {
424  return {};
425  }
426  CHECK_EQ(column_type.get_compression(), kENCODING_NONE);
427  switch (column_type.get_type()) {
428  case kFLOAT:
 429  switch (parquet_column->physical_type()) {
 430  case parquet::Type::FLOAT:
 431  return std::make_shared<ParquetFixedLengthEncoder<float, float>>(
 432  buffer, omnisci_column, parquet_column);
 433  case parquet::Type::DOUBLE:
 434  return std::make_shared<ParquetFixedLengthEncoder<float, double>>(
 435  buffer, omnisci_column, parquet_column);
436  default:
437  UNREACHABLE();
438  }
439  case kDOUBLE:
440  CHECK(parquet_column->physical_type() == parquet::Type::DOUBLE);
441  return std::make_shared<ParquetFixedLengthEncoder<double, double>>(
442  buffer, omnisci_column, parquet_column);
443  default:
444  UNREACHABLE();
445  }
446  return {};
447 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
#define DOUBLE
#define UNREACHABLE()
Definition: Logger.h:253
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType
#define FLOAT

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_geospatial_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import 
)

Definition at line 815 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and is_valid_parquet_string().

Referenced by create_parquet_encoder().

821  {
822  auto column_type = omnisci_column->columnType;
823  if (!is_valid_parquet_string(parquet_column) || !column_type.is_geometry()) {
824  return {};
825  }
826  if (is_for_import) {
827  return std::make_shared<ParquetGeospatialImportEncoder>(chunks);
828  }
829  if (is_metadata_scan) {
830  return std::make_shared<ParquetGeospatialEncoder>();
831  }
832  for (auto chunks_iter = chunks.begin(); chunks_iter != chunks.end(); ++chunks_iter) {
833  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
834  auto& chunk_metadata_ptr = chunk_metadata.back();
835  chunk_metadata_ptr->sqlType = chunks_iter->getColumnDesc()->columnType;
836  }
837  return std::make_shared<ParquetGeospatialEncoder>(
838  parquet_column, chunks, chunk_metadata);
839 }
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 288 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, and UNREACHABLE.

Referenced by create_parquet_encoder().

292  {
293  auto column_type = omnisci_column->columnType;
294  auto physical_type = parquet_column->physical_type();
295 
296  int bit_width = -1;
297  int is_signed = false;
298  // handle the integral case with no Parquet annotation
299  if (parquet_column->logical_type()->is_none() && column_type.is_integer()) {
300  if (physical_type == parquet::Type::INT32) {
301  bit_width = 32;
302  } else if (physical_type == parquet::Type::INT64) {
303  bit_width = 64;
304  } else {
305  UNREACHABLE();
306  }
307  is_signed = true;
308  }
309  // handle the integral case with Parquet annotation
310  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
311  parquet_column->logical_type().get())) {
312  bit_width = int_logical_column->bit_width();
313  is_signed = int_logical_column->is_signed();
314  }
315 
316  if (bit_width == -1) { // no valid logical type (with or without annotation) found
317  return {};
318  }
319 
320  const size_t omnisci_data_type_byte_size = column_type.get_size();
321  const size_t parquet_data_type_byte_size = parquet::GetTypeByteSize(physical_type);
322 
323  switch (omnisci_data_type_byte_size) {
324  case 8:
325  CHECK(column_type.get_compression() == kENCODING_NONE);
326  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int64_t>(
327  buffer,
328  omnisci_data_type_byte_size,
329  parquet_data_type_byte_size,
330  bit_width,
331  is_signed);
332  case 4:
333  if (is_metadata_scan_or_for_import && column_type.get_type() == kBIGINT) {
334  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int32_t>(
335  buffer,
336  omnisci_data_type_byte_size,
337  parquet_data_type_byte_size,
338  bit_width,
339  is_signed);
340  }
341  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int32_t>(
342  buffer,
343  omnisci_data_type_byte_size,
344  parquet_data_type_byte_size,
345  bit_width,
346  is_signed);
347  case 2:
348  if (is_metadata_scan_or_for_import) {
349  switch (column_type.get_type()) {
350  case kBIGINT:
351  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int16_t>(
352  buffer,
353  omnisci_data_type_byte_size,
354  parquet_data_type_byte_size,
355  bit_width,
356  is_signed);
357  case kINT:
358  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int16_t>(
359  buffer,
360  omnisci_data_type_byte_size,
361  parquet_data_type_byte_size,
362  bit_width,
363  is_signed);
364  case kSMALLINT:
365  break;
366  default:
367  UNREACHABLE();
368  }
369  }
370  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int16_t>(
371  buffer,
372  omnisci_data_type_byte_size,
373  parquet_data_type_byte_size,
374  bit_width,
375  is_signed);
376  case 1:
377  if (is_metadata_scan_or_for_import) {
378  switch (column_type.get_type()) {
379  case kBIGINT:
380  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int8_t>(
381  buffer,
382  omnisci_data_type_byte_size,
383  parquet_data_type_byte_size,
384  bit_width,
385  is_signed);
386  case kINT:
387  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int8_t>(
388  buffer,
389  omnisci_data_type_byte_size,
390  parquet_data_type_byte_size,
391  bit_width,
392  is_signed);
393  case kSMALLINT:
394  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int8_t>(
395  buffer,
396  omnisci_data_type_byte_size,
397  parquet_data_type_byte_size,
398  bit_width,
399  is_signed);
400  case kTINYINT:
401  break;
402  default:
403  UNREACHABLE();
404  }
405  }
406  return create_parquet_integral_encoder_with_omnisci_type<int8_t, int8_t>(
407  buffer,
408  omnisci_data_type_byte_size,
409  parquet_data_type_byte_size,
410  bit_width,
411  is_signed);
412  default:
413  UNREACHABLE();
414  }
415  return {};
416 }
#define UNREACHABLE()
Definition: Logger.h:253
#define CHECK(condition)
Definition: Logger.h:209
Definition: sqltypes.h:45
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder_with_omnisci_type ( AbstractBuffer *  buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const int  bit_width,
const bool  is_signed 
)

Create an integral parquet encoder using types.

Parameters
buffer - buffer used within the encoder
omnisci_data_type_byte_size - size in number of bytes of OmniSci type
parquet_data_type_byte_size - size in number of bytes of Parquet physical type
bit_width - bit width specified for the Parquet column
is_signed - flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated type V and NullType.

Note, this function determines the appropriate bit depth integral encoder to create, while create_parquet_signed_or_unsigned_integral_encoder_with_types determines whether to create a signed or unsigned integral encoder.

Definition at line 251 of file LazyParquetChunkLoader.cpp.

References create_parquet_signed_or_unsigned_integral_encoder_with_types(), and UNREACHABLE.

256  {
 257  switch (bit_width) {
 258  case 8:
 259  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
 260  int32_t,
 261  uint8_t,
 262  NullType>(
 263  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
 264  case 16:
 265  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
 266  int32_t,
 267  uint16_t,
 268  NullType>(
 269  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
 270  case 32:
 271  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
 272  int32_t,
 273  uint32_t,
 274  NullType>(
 275  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
 276  case 64:
 277  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
 278  int64_t,
 279  uint64_t,
 280  NullType>(
 281  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
 282  default:
 283  UNREACHABLE();
 284  }
285  return {};
286 }
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types(AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
Create a signed or unsigned integral parquet encoder using types.
#define UNREACHABLE()
Definition: Logger.h:253

+ Here is the call graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_none_type_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer 
)

Definition at line 449 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_string(), kBOOLEAN, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

452  {
453  auto column_type = omnisci_column->columnType;
454  if (parquet_column->logical_type()->is_none() &&
455  !omnisci_column->columnType.is_string()) { // boolean
456  if (column_type.get_compression() == kENCODING_NONE) {
457  switch (column_type.get_type()) {
458  case kBOOLEAN:
459  return std::make_shared<ParquetFixedLengthEncoder<int8_t, bool>>(
460  buffer, omnisci_column, parquet_column);
461  default:
462  UNREACHABLE();
463  }
464  } else {
465  UNREACHABLE();
466  }
467  }
468  return {};
469 }
#define UNREACHABLE()
Definition: Logger.h:253
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:509

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename U , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_signed_or_unsigned_integral_encoder_with_types ( AbstractBuffer *  buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const bool  is_signed 
)

Create a signed or unsigned integral parquet encoder using types.

Parameters
buffer - buffer used within the encoder
omnisci_data_type_byte_size - size in number of bytes of OmniSci type
parquet_data_type_byte_size - size in number of bytes of Parquet physical type
is_signed - flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated types V, T, U, and NullType.

Definition at line 216 of file LazyParquetChunkLoader.cpp.

References CHECK.

Referenced by create_parquet_integral_encoder_with_omnisci_type().

220  {
221  CHECK(sizeof(NullType) == omnisci_data_type_byte_size);
222  if (is_signed) {
223  return std::make_shared<ParquetFixedLengthEncoder<V, T, NullType>>(
224  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
225  } else {
226  return std::make_shared<ParquetUnsignedFixedLengthEncoder<V, T, U, NullType>>(
227  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
228  }
229 }
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_string_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const Chunk_NS::Chunk &  chunk,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
bool  is_for_import 
)

Definition at line 767 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, Chunk_NS::Chunk::getBuffer(), Chunk_NS::Chunk::getIndexBuf(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

773  {
774  auto column_type = omnisci_column->columnType;
775  if (!is_valid_parquet_string(parquet_column) ||
776  !omnisci_column->columnType.is_string()) {
777  return {};
778  }
779  if (column_type.get_compression() == kENCODING_NONE) {
780  if (is_for_import) {
781  return std::make_shared<ParquetStringImportEncoder>(chunk.getBuffer());
782  } else {
783  return std::make_shared<ParquetStringNoneEncoder>(chunk.getBuffer(),
784  chunk.getIndexBuf());
785  }
786  } else if (column_type.get_compression() == kENCODING_DICT) {
787  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
788  std::unique_ptr<ChunkMetadata>& logical_chunk_metadata = chunk_metadata.back();
789  logical_chunk_metadata->sqlType = omnisci_column->columnType;
790  switch (column_type.get_size()) {
791  case 1:
792  return std::make_shared<ParquetStringEncoder<uint8_t>>(
793  chunk.getBuffer(),
794  string_dictionary,
795  is_for_import ? nullptr : logical_chunk_metadata.get());
796  case 2:
797  return std::make_shared<ParquetStringEncoder<uint16_t>>(
798  chunk.getBuffer(),
799  string_dictionary,
800  is_for_import ? nullptr : logical_chunk_metadata.get());
801  case 4:
802  return std::make_shared<ParquetStringEncoder<int32_t>>(
803  chunk.getBuffer(),
804  string_dictionary,
805  is_for_import ? nullptr : logical_chunk_metadata.get());
806  default:
807  UNREACHABLE();
808  }
809  } else {
810  UNREACHABLE();
811  }
812  return {};
813 }
AbstractBuffer * getIndexBuf() const
Definition: Chunk.h:109
#define UNREACHABLE()
Definition: Logger.h:253
AbstractBuffer * getBuffer() const
Definition: Chunk.h:107
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:509

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 630 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

634  {
635  auto column_type = omnisci_column->columnType;
636  if (auto time_logical_column = dynamic_cast<const parquet::TimeLogicalType*>(
637  parquet_column->logical_type().get())) {
638  if (column_type.get_compression() == kENCODING_NONE) {
639  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
640  return create_parquet_time_encoder_with_types<int64_t, int32_t, int64_t>(
641  omnisci_column, parquet_column, buffer);
642  } else {
643  return create_parquet_time_encoder_with_types<int64_t, int64_t, int64_t>(
644  omnisci_column, parquet_column, buffer);
645  }
646  } else if (column_type.get_compression() == kENCODING_FIXED) {
647  if (is_metadata_scan_or_for_import) {
648  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
649  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
650  return create_parquet_time_encoder_with_types<int64_t, int32_t, int32_t>(
651  omnisci_column, parquet_column, buffer);
652  } else {
653  CHECK(time_logical_column->time_unit() ==
654  parquet::LogicalType::TimeUnit::MICROS ||
655  time_logical_column->time_unit() ==
656  parquet::LogicalType::TimeUnit::NANOS);
657  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
658  return create_parquet_time_encoder_with_types<int64_t, int64_t, int32_t>(
659  omnisci_column, parquet_column, buffer);
660  }
661  } else {
662  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
663  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
664  return create_parquet_time_encoder_with_types<int32_t, int32_t, int32_t>(
665  omnisci_column, parquet_column, buffer);
666  } else {
667  CHECK(time_logical_column->time_unit() ==
668  parquet::LogicalType::TimeUnit::MICROS ||
669  time_logical_column->time_unit() ==
670  parquet::LogicalType::TimeUnit::NANOS);
671  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
672  return create_parquet_time_encoder_with_types<int32_t, int64_t, int32_t>(
673  omnisci_column, parquet_column, buffer);
674  }
675  }
676  } else {
677  UNREACHABLE();
678  }
679  }
680  return {};
681 }
#define UNREACHABLE()
Definition: Logger.h:253
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer 
)

Definition at line 604 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

607  {
608  if (auto time_logical_type = dynamic_cast<const parquet::TimeLogicalType*>(
609  parquet_column->logical_type().get())) {
610  switch (time_logical_type->time_unit()) {
611  case parquet::LogicalType::TimeUnit::MILLIS:
612  return std::make_shared<ParquetTimeEncoder<V, T, 1000L, NullType>>(
613  buffer, omnisci_column, parquet_column);
614  case parquet::LogicalType::TimeUnit::MICROS:
615  return std::make_shared<ParquetTimeEncoder<V, T, 1000L * 1000L, NullType>>(
616  buffer, omnisci_column, parquet_column);
617  case parquet::LogicalType::TimeUnit::NANOS:
618  return std::make_shared<
619  ParquetTimeEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
620  buffer, omnisci_column, parquet_column);
621  default:
622  UNREACHABLE();
623  }
624  } else {
625  UNREACHABLE();
626  }
627  return {};
628 }
#define UNREACHABLE()
Definition: Logger.h:253
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 545 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_precision(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

549  {
550  auto column_type = omnisci_column->columnType;
551  auto precision = column_type.get_precision();
552  if (parquet_column->logical_type()->is_timestamp()) {
553  if (column_type.get_compression() == kENCODING_NONE) {
554  if (precision == 0) {
555  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int64_t>(
556  omnisci_column, parquet_column, buffer);
557  } else {
558  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
559  buffer, omnisci_column, parquet_column);
560  }
561  } else if (column_type.get_compression() == kENCODING_FIXED) {
562  CHECK(column_type.get_comp_param() == 32);
563  if (is_metadata_scan_or_for_import) {
564  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int32_t>(
565  omnisci_column, parquet_column, buffer);
566  } else {
567  return create_parquet_timestamp_encoder_with_types<int32_t, int64_t, int32_t>(
568  omnisci_column, parquet_column, buffer);
569  }
570  }
571  } else if (parquet_column->logical_type()->is_none() && column_type.is_timestamp()) {
572  if (parquet_column->physical_type() == parquet::Type::INT32) {
573  CHECK(column_type.get_compression() == kENCODING_FIXED &&
574  column_type.get_comp_param() == 32);
575  if (is_metadata_scan_or_for_import) {
576  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int32_t, int32_t>>(
577  buffer, omnisci_column, parquet_column);
578  } else {
579  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t, int32_t>>(
580  buffer, omnisci_column, parquet_column);
581  }
582  } else if (parquet_column->physical_type() == parquet::Type::INT64) {
583  if (column_type.get_compression() == kENCODING_NONE) {
584  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
585  buffer, omnisci_column, parquet_column);
586  } else if (column_type.get_compression() == kENCODING_FIXED) {
587  CHECK(column_type.get_comp_param() == 32);
588  if (is_metadata_scan_or_for_import) {
589  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int32_t>>(
590  buffer, omnisci_column, parquet_column);
591  } else {
592  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int64_t, int32_t>>(
593  buffer, omnisci_column, parquet_column);
594  }
595  }
596  } else {
597  UNREACHABLE();
598  }
599  }
600  return {};
601 }
#define UNREACHABLE()
Definition: Logger.h:253
int get_precision() const
Definition: sqltypes.h:332
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer 
)

Definition at line 472 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

475  {
476  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
477  parquet_column->logical_type().get())) {
478  switch (timestamp_logical_type->time_unit()) {
479  case parquet::LogicalType::TimeUnit::MILLIS:
480  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L, NullType>>(
481  buffer, omnisci_column, parquet_column);
482  case parquet::LogicalType::TimeUnit::MICROS:
483  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
484  buffer, omnisci_column, parquet_column);
485  case parquet::LogicalType::TimeUnit::NANOS:
486  return std::make_shared<
487  ParquetTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
488  buffer, omnisci_column, parquet_column);
489  default:
490  UNREACHABLE();
491  }
492  } else {
493  UNREACHABLE();
494  }
495  return {};
496 }
#define UNREACHABLE()
Definition: Logger.h:253
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const ColumnDescriptor *  omnisci_column)

Definition at line 1175 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by validate_timestamp_mapping().

1175  {
1176  return omnisci_column->columnType.get_dimension() == 6;
1177 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1179 of file LazyParquetChunkLoader.cpp.

1180  {
1181  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MICROS;
1182 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const ColumnDescriptor *  omnisci_column)

Definition at line 1184 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by validate_timestamp_mapping().

1184  {
1185  return omnisci_column->columnType.get_dimension() == 3;
1186 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1188 of file LazyParquetChunkLoader.cpp.

1189  {
1190  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS;
1191 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const ColumnDescriptor *  omnisci_column)

Definition at line 1166 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by validate_timestamp_mapping().

1166  {
1167  return omnisci_column->columnType.get_dimension() == 9;
1168 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1170 of file LazyParquetChunkLoader.cpp.

1171  {
1172  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::NANOS;
1173 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_list_column ( const parquet::ColumnDescriptor *  parquet_column)

Detect a valid list parquet column.

Parameters
parquet_column- the parquet column descriptor of the column to detect
Returns
true if it is a valid parquet list column

Note: the notion of a valid parquet list column is adapted from the parquet schema specification for logical type definitions:

<list-repetition> group <name> (LIST) { repeated group list { <element-repetition> <element-type> element; } }

Testing has shown that there are small deviations from this specification in at least one library — pyarrow — where the innermost schema node is named "item" as opposed to "element".

The following is also true of the schema definition.

  • The outer-most level must be a group annotated with LIST that contains a single field named list. The repetition of this level must be either optional or required and determines whether the list is nullable.
  • The middle level, named list, must be a repeated group with a single field named element.
  • The element field encodes the list's element type and repetition. Element repetition must be required or optional.

FSI further restricts lists to be defined only at the top level, meaning directly below the root schema node.

Definition at line 92 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_array_encoder(), validate_array_mapping(), validate_column_mapping_and_row_group_metadata(), validate_definition_levels(), and validate_max_repetition_and_definition_level().

92  {
93  const parquet::schema::Node* node = parquet_column->schema_node().get();
94  if ((node->name() != "element" && node->name() != "item") ||
95  !(node->is_required() ||
96  node->is_optional())) { // ensure first innermost node is named "element"
97  // which is required by the parquet specification;
98  // however testing shows that pyarrow generates this
99  // column with the name of "item"
100  // this field must be either required or optional
101  return false;
102  }
103  node = node->parent();
104  if (!node) { // required nested structure
105  return false;
106  }
107  if (node->name() != "list" || !node->is_repeated() ||
108  !node->is_group()) { // ensure second innermost node is named "list" which is
109  // a repeated group; this is
110  // required by the parquet specification
111  return false;
112  }
113  node = node->parent();
114  if (!node) { // required nested structure
115  return false;
116  }
117  if (!node->logical_type()->is_list() ||
118  !(node->is_optional() ||
119  node->is_required())) { // ensure third outermost node has logical type LIST
120  // which is either optional or required; this is required
121  // by the parquet specification
122  return false;
123  }
124  node =
125  node->parent(); // this must now be the root node of schema which is required by
126  // FSI (lists can not be embedded into a deeper nested structure)
127  if (!node) { // required nested structure
128  return false;
129  }
130  node = node->parent();
131  if (node) { // implies the previous node was not the root node
132  return false;
133  }
134  return true;
135 }

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_string ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 50 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_geospatial_encoder(), create_parquet_string_encoder(), validate_geospatial_mapping(), and validate_string_mapping().

50  {
51  return (parquet_column->logical_type()->is_none() &&
52  parquet_column->physical_type() == parquet::Type::BYTE_ARRAY) ||
53  parquet_column->logical_type()->is_string();
54 }

+ Here is the caller graph for this function:

std::list<RowGroupMetadata> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval ( const std::map< int, std::shared_ptr< ParquetEncoder >> &  encoder_map,
const RowGroupInterval &  row_group_interval,
const ReaderPtr &  reader,
const ForeignTableSchema &  schema 
)

Definition at line 1446 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnId, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::ForeignTableSchema::getLogicalColumn(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::RowGroupInterval::start_index.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1450  {
1451  std::list<RowGroupMetadata> row_group_metadata;
1452  auto column_interval =
1453  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
1454  schema.getLogicalAndPhysicalColumns().back()->columnId};
1455 
1456  auto file_metadata = reader->parquet_reader()->metadata();
1457  for (int row_group = row_group_interval.start_index;
1458  row_group <= row_group_interval.end_index;
1459  ++row_group) {
1460  auto& row_group_metadata_item = row_group_metadata.emplace_back();
1461  row_group_metadata_item.row_group_index = row_group;
1462  row_group_metadata_item.file_path = row_group_interval.file_path;
1463 
1464  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1465  file_metadata->RowGroup(row_group);
1466 
1467  for (int column_id = column_interval.start; column_id <= column_interval.end;
1468  column_id++) {
1469  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1470  auto parquet_column_index = schema.getParquetColumnIndex(column_id);
1471  auto encoder_map_iter =
1472  encoder_map.find(schema.getLogicalColumn(column_id)->columnId);
1473  CHECK(encoder_map_iter != encoder_map.end());
1474  try {
1475  auto metadata = encoder_map_iter->second->getRowGroupMetadata(
1476  group_metadata.get(), parquet_column_index, column_descriptor->columnType);
1477  row_group_metadata_item.column_chunk_metadata.emplace_back(metadata);
1478  } catch (const std::exception& e) {
1479  std::stringstream error_message;
1480  error_message << e.what() << " in row group " << row_group << " of Parquet file '"
1481  << row_group_interval.file_path << "'.";
1482  throw std::runtime_error(error_message.str());
1483  }
1484  }
1485  }
1486  return row_group_metadata;
1487 }
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_import ( const std::map< int, Chunk_NS::Chunk chunks,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const std::map< int, StringDictionary * >  column_dictionaries 
)

Definition at line 1489 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_import(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and i.

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1493  {
1494  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1495  auto file_metadata = reader->parquet_reader()->metadata();
1496  for (auto& [column_id, chunk] : chunks) {
1497  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1498  if (column_descriptor->isGeoPhyCol) { // skip physical columns
1499  continue;
1500  }
1501  auto parquet_column_descriptor =
1502  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1503  auto find_it = column_dictionaries.find(column_id);
1504  StringDictionary* dictionary =
1505  (find_it == column_dictionaries.end() ? nullptr : find_it->second);
1506  std::list<Chunk_NS::Chunk> chunks_for_import;
1507  chunks_for_import.push_back(chunk);
1508  if (column_descriptor->columnType.is_geometry()) {
1509  for (int i = 0; i < column_descriptor->columnType.get_physical_cols(); ++i) {
1510  chunks_for_import.push_back(chunks.at(column_id + i + 1));
1511  }
1512  }
1513  encoder_map[column_id] = create_parquet_encoder_for_import(
1514  chunks_for_import, column_descriptor, parquet_column_descriptor, dictionary);
1515  }
1516  return encoder_map;
1517 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import(std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_metadata_scan ( const Interval< ColumnType > &  column_interval,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader 
)

Definition at line 1519 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_metadata_scan(), foreign_storage::Interval< T >::end, foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::Interval< T >::start.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1522  {
1523  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1524  auto file_metadata = reader->parquet_reader()->metadata();
1525  for (int column_id = column_interval.start; column_id <= column_interval.end;
1526  column_id++) {
1527  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1528  auto parquet_column_descriptor =
1529  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1530  encoder_map[column_id] = create_parquet_encoder_for_metadata_scan(
1531  column_descriptor, parquet_column_descriptor);
1532  column_id += column_descriptor->columnType.get_physical_cols();
1533  }
1534  return encoder_map;
1535 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
T const end
Definition: Intervals.h:68

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::vector< int8_t > &  values 
)

Definition at line 1098 of file LazyParquetChunkLoader.cpp.

References foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements, ColumnDescriptor::columnType, and SQLTypeInfo::get_size().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1100  {
1101  auto max_type_byte_size =
1102  std::max(omnisci_column->columnType.get_size(),
1103  parquet::GetTypeByteSize(parquet_column->physical_type()));
1104  size_t values_size =
1105  LazyParquetChunkLoader::batch_reader_num_elements * max_type_byte_size;
1106  values.resize(values_size);
1107 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

/**
 * @brief Throw a runtime error reporting a row group / column that lacks the
 * required statistics metadata.
 *
 * @param row_group_index  index of the offending row group
 * @param column_index     index of the offending column
 * @param file_path        path of the Parquet file being scanned
 * @throws std::runtime_error always
 *
 * Defined at line 1360 of LazyParquetChunkLoader.cpp.
 * Referenced by validate_column_mapping_and_row_group_metadata().
 */
void throw_missing_metadata_error(const int row_group_index,
                                  const int column_index,
                                  const std::string& file_path) {
  std::string message =
      "Statistics metadata is required for all row groups. Metadata is missing for "
      "row group index: ";
  message += std::to_string(row_group_index);
  message += ", column index: " + std::to_string(column_index);
  message += ", file path: " + file_path;
  throw std::runtime_error{message};
}
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

/**
 * @brief Throw a runtime error reporting that a Parquet row group exceeds the
 * table's fragment size.
 *
 * @param row_group_index     index of the largest row group
 * @param max_row_group_size  row count of that row group
 * @param fragment_size       the table's configured fragment size
 * @param file_path           path of the Parquet file being scanned
 * @throws std::runtime_error always
 *
 * Defined at line 1370 of LazyParquetChunkLoader.cpp.
 * Referenced by validate_column_mapping_and_row_group_metadata().
 */
void throw_row_group_larger_than_fragment_size_error(const int row_group_index,
                                                     const int64_t max_row_group_size,
                                                     const int fragment_size,
                                                     const std::string& file_path) {
  std::string message =
      "Parquet file has a row group size that is larger than the fragment size. "
      "Please set the table fragment size to a number that is larger than the "
      "row group size. Row group index: ";
  message += std::to_string(row_group_index);
  message += ", row group size: " + std::to_string(max_row_group_size);
  message += ", fragment size: " + std::to_string(fragment_size);
  message += ", file path: " + file_path;
  throw std::runtime_error{message};
}
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping ( const parquet::ColumnDescriptor *  parquet_column,
const ColumnDescriptor omnisci_column 
)

Definition at line 1319 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_type_name(), foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported(), LOG, run_benchmark_import::type, and logger::WARNING.

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_column_mapping_and_row_group_metadata().

1320  {
1321  parquet::Type::type physical_type = parquet_column->physical_type();
1322  auto logical_type = parquet_column->logical_type();
1323  bool allowed_type =
1324  LazyParquetChunkLoader::isColumnMappingSupported(omnisci_column, parquet_column);
1325  if (!allowed_type) {
1326  if (logical_type->is_timestamp()) {
1327  auto timestamp_type =
1328  dynamic_cast<const parquet::TimestampLogicalType*>(logical_type.get());
1329  CHECK(timestamp_type);
1330 
1331  if (!timestamp_type->is_adjusted_to_utc()) {
1332  LOG(WARNING) << "Non-UTC timezone specified in Parquet file for column \""
1333  << omnisci_column->columnName
1334  << "\". Only UTC timezone is currently supported.";
1335  }
1336  }
1337  std::string parquet_type;
1338  if (parquet_column->logical_type()->is_none()) {
1339  parquet_type = parquet::TypeToString(physical_type);
1340  } else {
1341  parquet_type = logical_type->ToString();
1342  }
1343  std::string omnisci_type = omnisci_column->columnType.get_type_name();
1344  throw std::runtime_error{"Conversion from Parquet type \"" + parquet_type +
1345  "\" to OmniSci type \"" + omnisci_type +
1346  "\" is not allowed. Please use an appropriate column type."};
1347  }
1348 }
#define LOG(tag)
Definition: Logger.h:203
std::string get_type_name() const
Definition: sqltypes.h:432
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1276 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1277  {
1278  if (is_valid_parquet_list_column(parquet_column) &&
1279  omnisci_column->columnType.is_array()) {
1280  auto omnisci_column_sub_type_column = get_sub_type_column_descriptor(omnisci_column);
1281  return LazyParquetChunkLoader::isColumnMappingSupported(
1282  omnisci_column_sub_type_column.get(), parquet_column);
1283  }
1284  return false;
1285 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
SQLTypeInfo columnType
bool is_array() const
Definition: sqltypes.h:517

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_column_mapping_and_row_group_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1383 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::getForeignTable(), foreign_storage::ForeignTableSchema::getLogicalColumns(), i, is_valid_parquet_list_column(), TableDescriptor::maxFragRows, throw_missing_metadata_error(), throw_row_group_larger_than_fragment_size_error(), and validate_allowed_mapping().

Referenced by validate_parquet_metadata().

1386  {
1387  auto column_it = schema.getLogicalColumns().begin();
1388  for (int i = 0; i < file_metadata->num_columns(); ++i, ++column_it) {
1389  const parquet::ColumnDescriptor* descr = file_metadata->schema()->Column(i);
1390  try {
1391  validate_allowed_mapping(descr, *column_it);
1392  } catch (std::runtime_error& e) {
1393  std::stringstream error_message;
1394  error_message << e.what() << " Parquet column: " << descr->name()
1395  << ", OmniSci column: " << (*column_it)->columnName
1396  << ", Parquet file: " << file_path << ".";
1397  throw std::runtime_error(error_message.str());
1398  }
1399 
1400  auto fragment_size = schema.getForeignTable()->maxFragRows;
1401  int64_t max_row_group_size = 0;
1402  int max_row_group_index = 0;
1403  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
1404  auto group_metadata = file_metadata->RowGroup(r);
1405  auto num_rows = group_metadata->num_rows();
1406  if (num_rows > max_row_group_size) {
1407  max_row_group_size = num_rows;
1408  max_row_group_index = r;
1409  }
1410 
1411  auto column_chunk = group_metadata->ColumnChunk(i);
1412  bool contains_metadata = column_chunk->is_stats_set();
1413  if (contains_metadata) {
1414  auto stats = column_chunk->statistics();
1415  bool is_all_nulls = stats->null_count() == column_chunk->num_values();
1416  bool is_list = is_valid_parquet_list_column(file_metadata->schema()->Column(i));
1417  // Given a list, it is possible it has no min or max if it is comprised
1418  // only of empty lists & nulls. This can not be detected by comparing
1419  // the null count; therefore we afford list types the benefit of the
1420  // doubt in this situation.
1421  if (!(stats->HasMinMax() || is_all_nulls || is_list)) {
1422  contains_metadata = false;
1423  }
1424  }
1425 
1426  if (!contains_metadata) {
1427  throw_missing_metadata_error(r, i, file_path);
1428  }
1429  }
1430 
1431  if (max_row_group_size > fragment_size) {
1433  max_row_group_index, max_row_group_size, fragment_size, file_path);
1434  }
1435  }
1436 }
void throw_row_group_larger_than_fragment_size_error(const int row_group_index, const int64_t max_row_group_size, const int fragment_size, const std::string &file_path)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
void throw_missing_metadata_error(const int row_group_index, const int column_index, const std::string &file_path)
void validate_allowed_mapping(const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1251 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kDATE, kENCODING_DATE_IN_DAYS, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1252  {
1253  if (!(omnisci_column->columnType.get_type() == kDATE &&
1254  ((omnisci_column->columnType.get_compression() == kENCODING_DATE_IN_DAYS &&
1255  (omnisci_column->columnType.get_comp_param() ==
1256  0 // DATE ENCODING DAYS (32) specifies comp_param of 0
1257  || omnisci_column->columnType.get_comp_param() == 16)) ||
1258  omnisci_column->columnType.get_compression() ==
1259  kENCODING_NONE // for array types
1260  ))) {
1261  return false;
1262  }
1263  return parquet_column->logical_type()->is_date() ||
1264  parquet_column->logical_type()
1265  ->is_timestamp(); // to support TIMESTAMP -> DATE coercion
1266 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
Definition: sqltypes.h:53
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1109 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_precision(), SQLTypeInfo::get_scale(), SQLTypeInfo::is_decimal(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1110  {
1111  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1112  parquet_column->logical_type().get())) {
1113  return omnisci_column->columnType.get_precision() ==
1114  decimal_logical_column->precision() &&
1115  omnisci_column->columnType.get_scale() == decimal_logical_column->scale() &&
1116  omnisci_column->columnType.is_decimal() &&
1117  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1118  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1119  }
1120  return false;
1121 }
HOST DEVICE int get_scale() const
Definition: sqltypes.h:334
int get_precision() const
Definition: sqltypes.h:332
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
SQLTypeInfo columnType
bool is_decimal() const
Definition: sqltypes.h:512

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_definition_levels ( const parquet::ParquetFileReader *  reader,
const int  row_group_index,
const int  column_index,
const int16_t *  def_levels,
const int64_t  num_levels,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1032 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_list_column(), and foreign_storage::validate_and_get_column_metadata_statistics().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1038  {
1039  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1040  if (!is_valid_parquet_list) {
1041  return;
1042  }
1043  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1044  reader->metadata()->RowGroup(row_group_index);
1045  auto column_metadata = group_metadata->ColumnChunk(column_index);
1046  auto stats = validate_and_get_column_metadata_statistics(column_metadata.get());
1047  if (!stats->HasMinMax()) {
1048  auto find_it = std::find_if(def_levels,
1049  def_levels + num_levels,
1050  [](const int16_t def_level) { return def_level == 3; });
1051  if (find_it != def_levels + num_levels) {
1052  throw std::runtime_error(
1053  "No minimum and maximum statistic set in list column but non-null & non-empty "
1054  "array/value detected.");
1055  }
1056  }
1057 }
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema ( const parquet::arrow::FileReader *  reference_file_reader,
const parquet::arrow::FileReader *  new_file_reader,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 1293 of file LazyParquetChunkLoader.cpp.

References foreign_storage::get_column_descriptor(), i, to_string(), and foreign_storage::validate_equal_column_descriptor().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1296  {
1297  const auto reference_num_columns =
1298  reference_file_reader->parquet_reader()->metadata()->num_columns();
1299  const auto new_num_columns =
1300  new_file_reader->parquet_reader()->metadata()->num_columns();
1301  if (reference_num_columns != new_num_columns) {
1302  throw std::runtime_error{"Parquet file \"" + new_file_path +
1303  "\" has a different schema. Please ensure that all Parquet "
1304  "files use the same schema. Reference Parquet file: \"" +
1305  reference_file_path + "\" has " +
1306  std::to_string(reference_num_columns) +
1307  " columns. New Parquet file \"" + new_file_path + "\" has " +
1308  std::to_string(new_num_columns) + " columns."};
1309  }
1310 
1311  for (int i = 0; i < reference_num_columns; i++) {
1313  get_column_descriptor(new_file_reader, i),
1314  reference_file_path,
1315  new_file_path);
1316  }
1317 }
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
std::string to_string(char const *&&v)
const ColumnDescriptor * get_column_descriptor(const int col_id, const int table_id, const Catalog_Namespace::Catalog &cat)
Definition: Execute.h:189

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1123 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, DOUBLE, FLOAT, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), SQLTypeInfo::is_fp(), kENCODING_NONE, and kFLOAT.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1124  {
1125  if (!omnisci_column->columnType.is_fp()) {
1126  return false;
1127  }
1128  // check if mapping is a valid coerced or non-coerced floating point mapping
1129  // with no annotation (floating point columns have no annotation in the
1130  // Parquet specification)
1131  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1132  return (parquet_column->physical_type() == parquet::Type::DOUBLE) ||
1133  (parquet_column->physical_type() == parquet::Type::FLOAT &&
1134  omnisci_column->columnType.get_type() == kFLOAT);
1135  }
1136  return false;
1137 }
bool is_fp() const
Definition: sqltypes.h:513
#define DOUBLE
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
SQLTypeInfo columnType
#define FLOAT

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1287 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_geometry(), and is_valid_parquet_string().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1288  {
1289  return is_valid_parquet_string(parquet_column) &&
1290  omnisci_column->columnType.is_geometry();
1291 }
bool is_geometry() const
Definition: sqltypes.h:521
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1139 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_size(), SQLTypeInfo::is_integer(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1140  {
1141  if (!omnisci_column->columnType.is_integer()) {
1142  return false;
1143  }
1144  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1145  parquet_column->logical_type().get())) {
1146  CHECK(omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1147  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1148  const int bits_per_byte = 8;
1149  // unsigned types are permitted to map to a wider integral type in order to avoid
1150  // precision loss
1151  const int bit_widening_factor = int_logical_column->is_signed() ? 1 : 2;
1152  return omnisci_column->columnType.get_size() * bits_per_byte <=
1153  int_logical_column->bit_width() * bit_widening_factor;
1154  }
1155  // check if mapping is a valid coerced or non-coerced integral mapping with no
1156  // annotation
1157  if ((omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1158  omnisci_column->columnType.get_compression() == kENCODING_FIXED)) {
1159  return (parquet_column->physical_type() == parquet::Type::INT64) ||
1160  (parquet_column->physical_type() == parquet::Type::INT32 &&
1161  omnisci_column->columnType.get_size() <= 4);
1162  }
1163  return false;
1164 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
bool is_integer() const
Definition: sqltypes.h:511
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level ( const ColumnDescriptor omnisci_column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1059 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1061  {
1062  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1063  if (is_valid_parquet_list && !omnisci_column_descriptor->columnType.is_array()) {
1064  throw std::runtime_error(
1065  "Unsupported mapping detected. Column '" + parquet_column_descriptor->name() +
1066  "' detected to be a parquet list but OmniSci mapped column '" +
1067  omnisci_column_descriptor->columnName + "' is not an array.");
1068  }
1069  if (is_valid_parquet_list) {
1070  if (parquet_column_descriptor->max_repetition_level() != 1 ||
1071  parquet_column_descriptor->max_definition_level() != 3) {
1072  throw std::runtime_error(
1073  "Incorrect schema max repetition level detected in column '" +
1074  parquet_column_descriptor->name() +
1075  "'. Expected a max repetition level of 1 and max definition level of 3 for "
1076  "list column but column has a max "
1077  "repetition level of " +
1078  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1079  " and a max definition level of " +
1080  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1081  }
1082  } else {
1083  if (parquet_column_descriptor->max_repetition_level() != 0 ||
1084  parquet_column_descriptor->max_definition_level() != 1) {
1085  throw std::runtime_error(
1086  "Incorrect schema max repetition level detected in column '" +
1087  parquet_column_descriptor->name() +
1088  "'. Expected a max repetition level of 0 and max definition level of 1 for "
1089  "flat column but column has a max "
1090  "repetition level of " +
1091  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1092  " and a max definition level of " +
1093  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1094  }
1095  }
1096 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::string to_string(char const *&&v)
SQLTypeInfo columnType
std::string columnName
bool is_array() const
Definition: sqltypes.h:517

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1193 of file LazyParquetChunkLoader.cpp.

References BOOLEAN, ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kBOOLEAN, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1194  {
1195  bool is_none_encoded_mapping =
1196  omnisci_column->columnType.get_compression() == kENCODING_NONE &&
1197  (parquet_column->physical_type() == parquet::Type::BOOLEAN &&
1198  omnisci_column->columnType.get_type() == kBOOLEAN);
1199  return parquet_column->logical_type()->is_none() && is_none_encoded_mapping;
1200 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
SQLTypeInfo columnType
#define BOOLEAN

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1350 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::numLogicalColumns(), and foreign_storage::throw_number_of_columns_mismatch_error().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_parquet_metadata().

1353  {
1354  if (schema.numLogicalColumns() != file_metadata->num_columns()) {
1356  schema.numLogicalColumns(), file_metadata->num_columns(), file_path);
1357  }
1358 }
void throw_number_of_columns_mismatch_error(size_t num_table_cols, size_t num_file_cols, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1438 of file LazyParquetChunkLoader.cpp.

References validate_column_mapping_and_row_group_metadata(), and validate_number_of_columns().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1441  {
1442  validate_number_of_columns(file_metadata, file_path, schema);
1443  validate_column_mapping_and_row_group_metadata(file_metadata, file_path, schema);
1444 }
void validate_number_of_columns(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
void validate_column_mapping_and_row_group_metadata(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1268 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1269  {
1270  return is_valid_parquet_string(parquet_column) &&
1271  omnisci_column->columnType.is_string() &&
1272  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1273  omnisci_column->columnType.get_compression() == kENCODING_DICT);
1274 }
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:509

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1237 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kENCODING_FIXED, kENCODING_NONE, and kTIME.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1238  {
1239  if (!(omnisci_column->columnType.get_type() == kTIME &&
1240  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1241  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1242  omnisci_column->columnType.get_comp_param() == 32)))) {
1243  return false;
1244  }
1245  if (parquet_column->logical_type()->is_time()) {
1246  return true;
1247  }
1248  return false;
1249 }
Definition: sqltypes.h:49
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1202 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_dimension(), SQLTypeInfo::get_type(), is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_FIXED, kENCODING_NONE, and kTIMESTAMP.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1203  {
1204  if (!(omnisci_column->columnType.get_type() == kTIMESTAMP &&
1205  ((omnisci_column->columnType.get_compression() == kENCODING_NONE) ||
1206  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1207  omnisci_column->columnType.get_comp_param() == 32)))) {
1208  return false;
1209  }
1210  // check the annotated case
1211  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1212  parquet_column->logical_type().get())) {
1213  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1214  return omnisci_column->columnType.get_dimension() == 0 ||
1215  ((is_nanosecond_precision(omnisci_column) &&
1216  is_nanosecond_precision(timestamp_logical_column)) ||
1217  (is_microsecond_precision(omnisci_column) &&
1218  is_microsecond_precision(timestamp_logical_column)) ||
1219  (is_millisecond_precision(omnisci_column) &&
1220  is_millisecond_precision(timestamp_logical_column)));
1221  }
1222  if (omnisci_column->columnType.get_compression() == kENCODING_FIXED) {
1223  return omnisci_column->columnType.get_dimension() == 0;
1224  }
1225  }
1226  // check the unannotated case
1227  if (parquet_column->logical_type()->is_none() &&
1228  ((parquet_column->physical_type() == parquet::Type::INT32 &&
1229  omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1230  omnisci_column->columnType.get_comp_param() == 32) ||
1231  parquet_column->physical_type() == parquet::Type::INT64)) {
1232  return true;
1233  }
1234  return false;
1235 }
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function: