OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp} Namespace Reference

Classes

struct  MaxRowGroupSizeStats
 

Functions

bool within_range (int64_t lower_bound, int64_t upper_bound, int64_t value)
 
bool is_valid_parquet_string (const parquet::ColumnDescriptor *parquet_column)
 
bool is_valid_parquet_list_column (const parquet::ColumnDescriptor *parquet_column)
 Detect a valid list parquet column. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder_with_omnisci_type (const ColumnDescriptor *column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename U , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
 Create a signed or unsigned integral parquet encoder using types. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder_with_omnisci_type (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const int bit_width, const bool is_signed)
 Create a integral parquet encoder using types. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
 
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool geo_validate_geometry)
 
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
 Create a Parquet specific encoder for a Parquet to OmniSci mapping. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import (std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const bool geo_validate_geometry)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const bool geo_validate_geometry)
 
void validate_list_column_metadata_statistics (const parquet::ParquetFileReader *reader, const int row_group_index, const int column_index, const int16_t *def_levels, const int64_t num_levels, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void set_definition_levels_for_zero_max_definition_level_case (const parquet::ColumnDescriptor *parquet_column_descriptor, std::vector< int16_t > &def_levels)
 
void validate_max_repetition_and_definition_level (const ColumnDescriptor *omnisci_column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void resize_values_buffer (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::vector< int8_t > &values)
 
bool validate_decimal_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_decimal_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_floating_point_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_floating_point_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_integral_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_integral_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool is_nanosecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_nanosecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_microsecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_microsecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_millisecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_millisecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool validate_none_type_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_boolean_type_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_timestamp_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_timestamp_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_time_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_time_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_date_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_date_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_string_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_string_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_geospatial_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
void validate_equal_schema (const parquet::arrow::FileReader *reference_file_reader, const parquet::arrow::FileReader *new_file_reader, const std::string &reference_file_path, const std::string &new_file_path)
 
void validate_allowed_mapping (const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)
 
SQLTypeInfo suggest_column_scalar_type (const parquet::ColumnDescriptor *parquet_column)
 
void validate_number_of_columns (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
void throw_missing_metadata_error (const int row_group_index, const int column_index, const std::string &file_path)
 
void throw_row_group_larger_than_fragment_size_error (const MaxRowGroupSizeStats max_row_group_stats, const int fragment_size)
 
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema, const bool do_metadata_stats_validation)
 
MaxRowGroupSizeStats validate_parquet_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema, const bool do_metadata_stats_validation)
 
std::list< RowGroupMetadata > metadata_scan_rowgroup_interval (const std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const RowGroupInterval &row_group_interval, const ReaderPtr &reader, const ForeignTableSchema &schema)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_import (const std::map< int, Chunk_NS::Chunk > chunks, const ForeignTableSchema &schema, const ReaderPtr &reader, const std::map< int, StringDictionary * > column_dictionaries, const int64_t num_rows, const bool geo_validate_geometry)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_metadata_scan (const Interval< ColumnType > &column_interval, const ForeignTableSchema &schema, const ReaderPtr &reader, const bool do_metadata_stats_validation, const bool geo_validate_geometry)
 

Function Documentation

std::shared_ptr< ParquetEncoder > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  is_for_detect,
const bool  geo_validate_geometry 
)

Definition at line 1024 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_encoder(), foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), SQLTypeInfo::is_fixlen_array(), and is_valid_parquet_list_column().

Referenced by create_parquet_encoder().

1033  {
1034  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column);
1035  if (!is_valid_parquet_list || !omnisci_column->columnType.is_array()) {
1036  return {};
1037  }
1038  std::unique_ptr<ColumnDescriptor> omnisci_column_sub_type_column =
1039  get_sub_type_column_descriptor(omnisci_column);
1040  auto encoder = create_parquet_encoder(omnisci_column_sub_type_column.get(),
1041  parquet_column,
1042  chunks,
1043  string_dictionary,
1044  chunk_metadata,
1045  is_metadata_scan,
1046  is_for_import,
1047  is_for_detect,
1048  geo_validate_geometry);
1049  CHECK(encoder.get());
1050  auto scalar_encoder = std::dynamic_pointer_cast<ParquetScalarEncoder>(encoder);
1051  CHECK(scalar_encoder);
1052  if (!is_for_import) {
1053  if (!is_for_detect) {
1054  if (omnisci_column->columnType.is_fixlen_array()) {
1055  encoder = std::make_shared<ParquetFixedLengthArrayEncoder>(
1056  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1057  scalar_encoder,
1058  omnisci_column);
1059  } else {
1060  encoder = std::make_shared<ParquetVariableLengthArrayEncoder>(
1061  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1062  is_metadata_scan ? nullptr : chunks.begin()->getIndexBuf(),
1063  scalar_encoder,
1064  omnisci_column);
1065  }
1066  } else { // is_for_detect
1067  encoder = std::make_shared<ParquetArrayDetectEncoder>(
1068  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1069  }
1070  } else { // is_for_import
1071  encoder = std::make_shared<ParquetArrayImportEncoder>(
1072  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1073  }
1074  return encoder;
1075 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.
bool is_fixlen_array() const
Definition: sqltypes.h:589
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType
bool is_array() const
Definition: sqltypes.h:583

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 736 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, kENCODING_DATE_IN_DAYS, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

740  {
741  auto column_type = omnisci_column->columnType;
742  if (parquet_column->logical_type()->is_date() && column_type.is_date()) {
743  if (column_type.get_compression() == kENCODING_DATE_IN_DAYS) {
744  if (is_metadata_scan_or_for_import) {
745  if (column_type.get_comp_param() ==
746  0) { // DATE ENCODING FIXED (32) uses comp param 0
747  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int32_t>>(
748  buffer);
749  } else if (column_type.get_comp_param() == 16) {
750  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int16_t>>(
751  buffer);
752  } else {
753  UNREACHABLE();
754  }
755  } else {
756  if (column_type.get_comp_param() ==
757  0) { // DATE ENCODING FIXED (32) uses comp param 0
758  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t>>(
759  buffer, omnisci_column, parquet_column);
760  } else if (column_type.get_comp_param() == 16) {
761  return std::make_shared<ParquetFixedLengthEncoder<int16_t, int32_t>>(
762  buffer, omnisci_column, parquet_column);
763  } else {
764  UNREACHABLE();
765  }
766  }
767  } else if (column_type.get_compression() == kENCODING_NONE) { // for array types
768  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int64_t>>(
769  buffer, omnisci_column, parquet_column);
770  } else {
771  UNREACHABLE();
772  }
773  }
774  return {};
775 }
#define UNREACHABLE()
Definition: Logger.h:338
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 693 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_date_from_timestamp_encoder_with_types(), kENCODING_DATE_IN_DAYS, and UNREACHABLE.

Referenced by create_parquet_encoder().

697  {
698  auto column_type = omnisci_column->columnType;
699  if (parquet_column->logical_type()->is_timestamp() && column_type.is_date()) {
700  CHECK(column_type.get_compression() == kENCODING_DATE_IN_DAYS);
701  if (is_metadata_scan_or_for_import) {
 702  if (column_type.get_comp_param() ==
 703  0) { // DATE ENCODING FIXED (32) uses comp param 0
 704  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
 705  int64_t,
 706  int32_t>(
 707  omnisci_column, parquet_column, buffer, true);
 708  } else if (column_type.get_comp_param() == 16) {
 709  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
 710  int64_t,
 711  int16_t>(
 712  omnisci_column, parquet_column, buffer, true);
 713  } else {
 714  UNREACHABLE();
 715  }
 716  } else {
 717  if (column_type.get_comp_param() ==
 718  0) { // DATE ENCODING FIXED (32) uses comp param 0
 719  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
 720  int64_t,
 721  int32_t>(
 722  omnisci_column, parquet_column, buffer, false);
 723  } else if (column_type.get_comp_param() == 16) {
 724  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
 725  int64_t,
 726  int16_t>(
 727  omnisci_column, parquet_column, buffer, false);
728  } else {
729  UNREACHABLE();
730  }
731  }
732  }
733  return {};
734 }
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 508 of file LazyParquetChunkLoader.cpp.

References heavydb.dtypes::T, and UNREACHABLE.

Referenced by create_parquet_date_from_timestamp_encoder().

512  {
513  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
514  parquet_column->logical_type().get())) {
515  switch (timestamp_logical_type->time_unit()) {
516  case parquet::LogicalType::TimeUnit::MILLIS:
517  if (is_metadata_scan_or_for_import) {
518  return std::make_shared<
519  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L, NullType>>(
520  buffer, omnisci_column, parquet_column);
521  }
522  return std::make_shared<
523  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L, NullType>>(
524  buffer, omnisci_column, parquet_column);
525  case parquet::LogicalType::TimeUnit::MICROS:
526  if (is_metadata_scan_or_for_import) {
527  return std::make_shared<
528  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
529  buffer, omnisci_column, parquet_column);
530  }
531  return std::make_shared<
532  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
533  buffer, omnisci_column, parquet_column);
534  case parquet::LogicalType::TimeUnit::NANOS:
535  if (is_metadata_scan_or_for_import) {
 536  return std::make_shared<
 537  ParquetDateInSecondsFromTimestampEncoder<V,
 538  T,
 539  1000L * 1000L * 1000L,
 540  NullType>>(
541  buffer, omnisci_column, parquet_column);
542  }
543  return std::make_shared<
544  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
545  buffer, omnisci_column, parquet_column);
546  default:
547  UNREACHABLE();
548  }
549  } else {
550  UNREACHABLE();
551  }
552  return {};
553 }
ParquetTimestampEncoder< V, T, conversion_denominator, NullType > ParquetDateInSecondsFromTimestampEncoder
#define UNREACHABLE()
Definition: Logger.h:338

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 171 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

175  {
176  if (parquet_column->logical_type()->is_decimal()) {
177  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
178  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int64_t>(
179  omnisci_column, parquet_column, buffer);
180  }
181  CHECK(omnisci_column->columnType.get_compression() == kENCODING_FIXED);
182  if (is_metadata_scan_or_for_import) {
183  switch (omnisci_column->columnType.get_comp_param()) {
184  case 16:
185  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int16_t>(
186  omnisci_column, parquet_column, buffer);
187  case 32:
188  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int32_t>(
189  omnisci_column, parquet_column, buffer);
190  default:
191  UNREACHABLE();
192  }
193  } else {
194  switch (omnisci_column->columnType.get_comp_param()) {
195  case 16:
196  return create_parquet_decimal_encoder_with_omnisci_type<int16_t, int16_t>(
197  omnisci_column, parquet_column, buffer);
198  case 32:
199  return create_parquet_decimal_encoder_with_omnisci_type<int32_t, int32_t>(
200  omnisci_column, parquet_column, buffer);
201  default:
202  UNREACHABLE();
203  }
204  }
205  }
206  return {};
207 }
#define UNREACHABLE()
Definition: Logger.h:338
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder_with_omnisci_type ( const ColumnDescriptor *  column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor,
AbstractBuffer *  buffer 
)

Definition at line 147 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

150  {
151  switch (parquet_column_descriptor->physical_type()) {
152  case parquet::Type::INT32:
153  return std::make_shared<ParquetDecimalEncoder<V, int32_t, NullType>>(
154  buffer, column_descriptor, parquet_column_descriptor);
155  case parquet::Type::INT64:
156  return std::make_shared<ParquetDecimalEncoder<V, int64_t, NullType>>(
157  buffer, column_descriptor, parquet_column_descriptor);
158  case parquet::Type::FIXED_LEN_BYTE_ARRAY:
159  return std::make_shared<
160  ParquetDecimalEncoder<V, parquet::FixedLenByteArray, NullType>>(
161  buffer, column_descriptor, parquet_column_descriptor);
162  case parquet::Type::BYTE_ARRAY:
163  return std::make_shared<ParquetDecimalEncoder<V, parquet::ByteArray, NullType>>(
164  buffer, column_descriptor, parquet_column_descriptor);
165  default:
166  UNREACHABLE();
167  }
168  return {};
169 }
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  is_for_detect,
const bool  geo_validate_geometry 
)

Create a Parquet specific encoder for a Parquet to OmniSci mapping.

Parameters
omnisci_column- the descriptor of OmniSci column
parquet_column- the descriptor of Parquet column
chunks- list of chunks to populate (the case of more than one chunk happens only if a logical column expands to multiple physical columns)
string_dictionary- string dictionary used in encoding for string dictionary encoded columns
chunk_metadata- similar to the list of chunks, a list of chunk metadata that is populated
is_metadata_scan- a flag indicating if the encoders created should be for a metadata scan
is_for_import- a flag indicating if the encoders created should be for import
Returns
An appropriate Parquet encoder for the use case defined by the Parquet to OmniSci mapping.

Notes:

  • In the case of a metadata scan, the type of the encoder created may significantly change (for example in bit width.) This is because it is common for OmniSci to store metadata in a different format altogether than the data itself (see for example FixedLengthEncoder.)
  • This function and the function isColumnMappingSupported work in conjunction with each other. For example, once a mapping is known to be allowed (since isColumnMappingSupported returned true) this function does not have to check many corner cases exhaustively as it would be redundant with what was checked in isColumnMappingSupported.

Definition at line 904 of file LazyParquetChunkLoader.cpp.

References CHECK, create_parquet_array_encoder(), create_parquet_date_encoder(), create_parquet_date_from_timestamp_encoder(), create_parquet_decimal_encoder(), create_parquet_floating_point_encoder(), create_parquet_geospatial_encoder(), create_parquet_integral_encoder(), create_parquet_none_type_encoder(), create_parquet_string_encoder(), create_parquet_time_encoder(), create_parquet_timestamp_encoder(), and UNREACHABLE.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), create_parquet_array_encoder(), create_parquet_encoder_for_import(), and create_parquet_encoder_for_metadata_scan().

913  {
914  CHECK(!(is_metadata_scan && is_for_import));
915  auto buffer = chunks.empty() ? nullptr : chunks.begin()->getBuffer();
916  if (auto encoder = create_parquet_geospatial_encoder(omnisci_column,
917  parquet_column,
918  chunks,
919  chunk_metadata,
920  is_metadata_scan,
921  is_for_import,
922  geo_validate_geometry)) {
923  return encoder;
924  }
925  if (auto encoder = create_parquet_array_encoder(omnisci_column,
926  parquet_column,
927  chunks,
928  string_dictionary,
929  chunk_metadata,
930  is_metadata_scan,
931  is_for_import,
932  is_for_detect,
933  geo_validate_geometry)) {
934  return encoder;
935  }
936  if (auto encoder = create_parquet_decimal_encoder(
937  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
938  return encoder;
939  }
940  if (auto encoder = create_parquet_integral_encoder(
941  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
942  return encoder;
943  }
944  if (auto encoder =
945  create_parquet_floating_point_encoder(omnisci_column, parquet_column, buffer)) {
946  return encoder;
947  }
948  if (auto encoder = create_parquet_timestamp_encoder(
949  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
950  return encoder;
951  }
952  if (auto encoder =
953  create_parquet_none_type_encoder(omnisci_column, parquet_column, buffer)) {
954  return encoder;
955  }
956  if (auto encoder = create_parquet_time_encoder(
957  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
958  return encoder;
 959  }
 960  if (auto encoder = create_parquet_date_from_timestamp_encoder(
 961  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
962  return encoder;
963  }
964  if (auto encoder = create_parquet_date_encoder(
965  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
966  return encoder;
967  }
968  if (auto encoder = create_parquet_string_encoder(
969  omnisci_column,
970  parquet_column,
971  chunks.empty() ? Chunk_NS::Chunk{} : *chunks.begin(),
972  string_dictionary,
973  chunk_metadata,
974  is_for_import,
975  is_for_detect)) {
976  return encoder;
977  }
978  UNREACHABLE();
979  return {};
980 }
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool geo_validate_geometry)
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_import ( std::list< Chunk_NS::Chunk > &  chunks,
const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
StringDictionary string_dictionary,
const bool  geo_validate_geometry 
)

Intended to be used for the import case.

Definition at line 985 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_import().

990  {
991  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
992  return create_parquet_encoder(omnisci_column,
993  parquet_column,
994  chunks,
995  string_dictionary,
996  chunk_metadata,
997  false,
998  true,
999  false,
1000  geo_validate_geometry);
1001 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_metadata_scan ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const bool  geo_validate_geometry 
)

Intended to be used only with a metadata scan. Creates an incomplete encoder capable of updating metadata.

Definition at line 1007 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_metadata_scan().

1010  {
1011  std::list<Chunk_NS::Chunk> chunks;
1012  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
1013  return create_parquet_encoder(omnisci_column,
1014  parquet_column,
1015  chunks,
1016  nullptr,
1017  chunk_metadata,
1018  true,
1019  false,
1020  false,
1021  geo_validate_geometry);
1022 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_floating_point_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 427 of file LazyParquetChunkLoader.cpp.

References CHECK, CHECK_EQ, ColumnDescriptor::columnType, kDOUBLE, kENCODING_NONE, kFLOAT, and UNREACHABLE.

Referenced by create_parquet_encoder().

430  {
431  auto column_type = omnisci_column->columnType;
432  if (!column_type.is_fp()) {
433  return {};
434  }
435  CHECK_EQ(column_type.get_compression(), kENCODING_NONE);
436  switch (column_type.get_type()) {
437  case kFLOAT:
438  switch (parquet_column->physical_type()) {
439  case parquet::Type::FLOAT:
440  return std::make_shared<ParquetFixedLengthEncoder<float, float>>(
441  buffer, omnisci_column, parquet_column);
442  case parquet::Type::DOUBLE:
443  return std::make_shared<ParquetFixedLengthEncoder<float, double>>(
444  buffer, omnisci_column, parquet_column);
445  default:
446  UNREACHABLE();
447  }
448  case kDOUBLE:
449  CHECK(parquet_column->physical_type() == parquet::Type::DOUBLE);
450  return std::make_shared<ParquetFixedLengthEncoder<double, double>>(
451  buffer, omnisci_column, parquet_column);
452  default:
453  UNREACHABLE();
454  }
455  return {};
456 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_geospatial_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  geo_validate_geometry 
)

Definition at line 830 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and is_valid_parquet_string().

Referenced by create_parquet_encoder().

837  {
838  auto column_type = omnisci_column->columnType;
839  if (!is_valid_parquet_string(parquet_column) || !column_type.is_geometry()) {
840  return {};
841  }
842  if (is_for_import) {
843  return std::make_shared<ParquetGeospatialImportEncoder>(chunks,
844  geo_validate_geometry);
845  }
846  if (is_metadata_scan) {
847  return std::make_shared<ParquetGeospatialEncoder>(geo_validate_geometry);
848  }
849  for (auto chunks_iter = chunks.begin(); chunks_iter != chunks.end(); ++chunks_iter) {
850  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
851  auto& chunk_metadata_ptr = chunk_metadata.back();
852  chunk_metadata_ptr->sqlType = chunks_iter->getColumnDesc()->columnType;
853  }
854  return std::make_shared<ParquetGeospatialEncoder>(
855  parquet_column, chunks, chunk_metadata, geo_validate_geometry);
856 }
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 297 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, and UNREACHABLE.

Referenced by create_parquet_encoder().

301  {
302  auto column_type = omnisci_column->columnType;
303  auto physical_type = parquet_column->physical_type();
304 
305  int bit_width = -1;
306  int is_signed = false;
307  // handle the integral case with no Parquet annotation
308  if (parquet_column->logical_type()->is_none() && column_type.is_integer()) {
309  if (physical_type == parquet::Type::INT32) {
310  bit_width = 32;
311  } else if (physical_type == parquet::Type::INT64) {
312  bit_width = 64;
313  } else {
314  UNREACHABLE();
315  }
316  is_signed = true;
317  }
318  // handle the integral case with Parquet annotation
319  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
320  parquet_column->logical_type().get())) {
321  bit_width = int_logical_column->bit_width();
322  is_signed = int_logical_column->is_signed();
323  }
324 
325  if (bit_width == -1) { // no valid logical type (with or without annotation) found
326  return {};
327  }
328 
329  const size_t omnisci_data_type_byte_size = column_type.get_size();
330  const size_t parquet_data_type_byte_size = parquet::GetTypeByteSize(physical_type);
331 
332  switch (omnisci_data_type_byte_size) {
333  case 8:
334  CHECK(column_type.get_compression() == kENCODING_NONE);
335  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int64_t>(
336  buffer,
337  omnisci_data_type_byte_size,
338  parquet_data_type_byte_size,
339  bit_width,
340  is_signed);
341  case 4:
342  if (is_metadata_scan_or_for_import && column_type.get_type() == kBIGINT) {
343  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int32_t>(
344  buffer,
345  omnisci_data_type_byte_size,
346  parquet_data_type_byte_size,
347  bit_width,
348  is_signed);
349  }
350  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int32_t>(
351  buffer,
352  omnisci_data_type_byte_size,
353  parquet_data_type_byte_size,
354  bit_width,
355  is_signed);
356  case 2:
357  if (is_metadata_scan_or_for_import) {
358  switch (column_type.get_type()) {
359  case kBIGINT:
360  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int16_t>(
361  buffer,
362  omnisci_data_type_byte_size,
363  parquet_data_type_byte_size,
364  bit_width,
365  is_signed);
366  case kINT:
367  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int16_t>(
368  buffer,
369  omnisci_data_type_byte_size,
370  parquet_data_type_byte_size,
371  bit_width,
372  is_signed);
373  case kSMALLINT:
374  break;
375  default:
376  UNREACHABLE();
377  }
378  }
379  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int16_t>(
380  buffer,
381  omnisci_data_type_byte_size,
382  parquet_data_type_byte_size,
383  bit_width,
384  is_signed);
385  case 1:
386  if (is_metadata_scan_or_for_import) {
387  switch (column_type.get_type()) {
388  case kBIGINT:
389  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int8_t>(
390  buffer,
391  omnisci_data_type_byte_size,
392  parquet_data_type_byte_size,
393  bit_width,
394  is_signed);
395  case kINT:
396  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int8_t>(
397  buffer,
398  omnisci_data_type_byte_size,
399  parquet_data_type_byte_size,
400  bit_width,
401  is_signed);
402  case kSMALLINT:
403  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int8_t>(
404  buffer,
405  omnisci_data_type_byte_size,
406  parquet_data_type_byte_size,
407  bit_width,
408  is_signed);
409  case kTINYINT:
410  break;
411  default:
412  UNREACHABLE();
413  }
414  }
415  return create_parquet_integral_encoder_with_omnisci_type<int8_t, int8_t>(
416  buffer,
417  omnisci_data_type_byte_size,
418  parquet_data_type_byte_size,
419  bit_width,
420  is_signed);
421  default:
422  UNREACHABLE();
423  }
424  return {};
425 }
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqltypes.h:72
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder_with_omnisci_type ( AbstractBuffer buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const int  bit_width,
const bool  is_signed 
)

Create an integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
bit_width- bit width specified for the Parquet column
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated type V and NullType.

Note, this function determines the appropriate bit depth integral encoder to create, while create_parquet_signed_or_unsigned_integral_encoder_with_types determines whether to create a signed or unsigned integral encoder.

Definition at line 260 of file LazyParquetChunkLoader.cpp.

References create_parquet_signed_or_unsigned_integral_encoder_with_types(), and UNREACHABLE.

265  {
266  switch (bit_width) {
267  case 8:
269  int32_t,
270  uint8_t,
271  NullType>(
272  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
273  case 16:
275  int32_t,
276  uint16_t,
277  NullType>(
278  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
279  case 32:
281  int32_t,
282  uint32_t,
283  NullType>(
284  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
285  case 64:
287  int64_t,
288  uint64_t,
289  NullType>(
290  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
291  default:
292  UNREACHABLE();
293  }
294  return {};
295 }
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types(AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
Create a signed or unsigned integral parquet encoder using types.
#define UNREACHABLE()
Definition: Logger.h:338

+ Here is the call graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_none_type_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 458 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_string(), kBOOLEAN, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

461  {
462  auto column_type = omnisci_column->columnType;
463  if (parquet_column->logical_type()->is_none() &&
464  !omnisci_column->columnType.is_string()) { // boolean
465  if (column_type.get_compression() == kENCODING_NONE) {
466  switch (column_type.get_type()) {
467  case kBOOLEAN:
468  return std::make_shared<ParquetFixedLengthEncoder<int8_t, bool>>(
469  buffer, omnisci_column, parquet_column);
470  default:
471  UNREACHABLE();
472  }
473  } else {
474  UNREACHABLE();
475  }
476  }
477  return {};
478 }
#define UNREACHABLE()
Definition: Logger.h:338
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:559

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename U , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_signed_or_unsigned_integral_encoder_with_types ( AbstractBuffer buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const bool  is_signed 
)

Create a signed or unsigned integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated types V, T, U, and NullType.

Definition at line 225 of file LazyParquetChunkLoader.cpp.

References CHECK.

Referenced by create_parquet_integral_encoder_with_omnisci_type().

229  {
230  CHECK(sizeof(NullType) == omnisci_data_type_byte_size);
231  if (is_signed) {
232  return std::make_shared<ParquetFixedLengthEncoder<V, T, NullType>>(
233  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
234  } else {
235  return std::make_shared<ParquetUnsignedFixedLengthEncoder<V, T, U, NullType>>(
236  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
237  }
238 }
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_string_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const Chunk_NS::Chunk chunk,
StringDictionary string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
bool  is_for_import,
const bool  is_for_detect 
)

Definition at line 777 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, Chunk_NS::Chunk::getBuffer(), Chunk_NS::Chunk::getIndexBuf(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

784  {
785  auto column_type = omnisci_column->columnType;
786  if (!is_valid_parquet_string(parquet_column) ||
787  !omnisci_column->columnType.is_string()) {
788  return {};
789  }
790  if (column_type.get_compression() == kENCODING_NONE) {
791  if (is_for_import) {
792  return std::make_shared<ParquetStringImportEncoder>(chunk.getBuffer());
793  } else {
794  return std::make_shared<ParquetStringNoneEncoder>(chunk.getBuffer(),
795  chunk.getIndexBuf());
796  }
797  } else if (column_type.get_compression() == kENCODING_DICT) {
798  if (!is_for_detect) { // non-detect use case
799  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
800  std::unique_ptr<ChunkMetadata>& logical_chunk_metadata = chunk_metadata.back();
801  logical_chunk_metadata->sqlType = omnisci_column->columnType;
802  switch (column_type.get_size()) {
803  case 1:
804  return std::make_shared<ParquetStringEncoder<uint8_t>>(
805  chunk.getBuffer(),
806  string_dictionary,
807  is_for_import ? nullptr : logical_chunk_metadata.get());
808  case 2:
809  return std::make_shared<ParquetStringEncoder<uint16_t>>(
810  chunk.getBuffer(),
811  string_dictionary,
812  is_for_import ? nullptr : logical_chunk_metadata.get());
813  case 4:
814  return std::make_shared<ParquetStringEncoder<int32_t>>(
815  chunk.getBuffer(),
816  string_dictionary,
817  is_for_import ? nullptr : logical_chunk_metadata.get());
818  default:
819  UNREACHABLE();
820  }
821  } else { // detect use-case
822  return std::make_shared<ParquetDetectStringEncoder>(chunk.getBuffer());
823  }
824  } else {
825  UNREACHABLE();
826  }
827  return {};
828 }
AbstractBuffer * getIndexBuf() const
Definition: Chunk.h:148
#define UNREACHABLE()
Definition: Logger.h:338
AbstractBuffer * getBuffer() const
Definition: Chunk.h:146
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:559

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 640 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

644  {
645  auto column_type = omnisci_column->columnType;
646  if (auto time_logical_column = dynamic_cast<const parquet::TimeLogicalType*>(
647  parquet_column->logical_type().get())) {
648  if (column_type.get_compression() == kENCODING_NONE) {
649  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
650  return create_parquet_time_encoder_with_types<int64_t, int32_t, int64_t>(
651  omnisci_column, parquet_column, buffer);
652  } else {
653  return create_parquet_time_encoder_with_types<int64_t, int64_t, int64_t>(
654  omnisci_column, parquet_column, buffer);
655  }
656  } else if (column_type.get_compression() == kENCODING_FIXED) {
657  if (is_metadata_scan_or_for_import) {
658  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
659  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
660  return create_parquet_time_encoder_with_types<int64_t, int32_t, int32_t>(
661  omnisci_column, parquet_column, buffer);
662  } else {
663  CHECK(time_logical_column->time_unit() ==
664  parquet::LogicalType::TimeUnit::MICROS ||
665  time_logical_column->time_unit() ==
666  parquet::LogicalType::TimeUnit::NANOS);
667  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
668  return create_parquet_time_encoder_with_types<int64_t, int64_t, int32_t>(
669  omnisci_column, parquet_column, buffer);
670  }
671  } else {
672  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
673  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
674  return create_parquet_time_encoder_with_types<int32_t, int32_t, int32_t>(
675  omnisci_column, parquet_column, buffer);
676  } else {
677  CHECK(time_logical_column->time_unit() ==
678  parquet::LogicalType::TimeUnit::MICROS ||
679  time_logical_column->time_unit() ==
680  parquet::LogicalType::TimeUnit::NANOS);
681  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
682  return create_parquet_time_encoder_with_types<int32_t, int64_t, int32_t>(
683  omnisci_column, parquet_column, buffer);
684  }
685  }
686  } else {
687  UNREACHABLE();
688  }
689  }
690  return {};
691 }
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder_with_types ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 614 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

617  {
618  if (auto time_logical_type = dynamic_cast<const parquet::TimeLogicalType*>(
619  parquet_column->logical_type().get())) {
620  switch (time_logical_type->time_unit()) {
621  case parquet::LogicalType::TimeUnit::MILLIS:
622  return std::make_shared<ParquetTimeEncoder<V, T, 1000L, NullType>>(
623  buffer, omnisci_column, parquet_column);
624  case parquet::LogicalType::TimeUnit::MICROS:
625  return std::make_shared<ParquetTimeEncoder<V, T, 1000L * 1000L, NullType>>(
626  buffer, omnisci_column, parquet_column);
627  case parquet::LogicalType::TimeUnit::NANOS:
628  return std::make_shared<
629  ParquetTimeEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
630  buffer, omnisci_column, parquet_column);
631  default:
632  UNREACHABLE();
633  }
634  } else {
635  UNREACHABLE();
636  }
637  return {};
638 }
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 555 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_precision(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

559  {
560  auto column_type = omnisci_column->columnType;
561  auto precision = column_type.get_precision();
562  if (parquet_column->logical_type()->is_timestamp()) {
563  if (column_type.get_compression() == kENCODING_NONE) {
564  if (precision == 0) {
565  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int64_t>(
566  omnisci_column, parquet_column, buffer);
567  } else {
568  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
569  buffer, omnisci_column, parquet_column);
570  }
571  } else if (column_type.get_compression() == kENCODING_FIXED) {
572  CHECK(column_type.get_comp_param() == 32);
573  if (is_metadata_scan_or_for_import) {
574  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int32_t>(
575  omnisci_column, parquet_column, buffer);
576  } else {
577  return create_parquet_timestamp_encoder_with_types<int32_t, int64_t, int32_t>(
578  omnisci_column, parquet_column, buffer);
579  }
580  }
581  } else if (parquet_column->logical_type()->is_none() && column_type.is_timestamp()) {
582  if (parquet_column->physical_type() == parquet::Type::INT32) {
583  CHECK(column_type.get_compression() == kENCODING_FIXED &&
584  column_type.get_comp_param() == 32);
585  if (is_metadata_scan_or_for_import) {
586  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int32_t, int32_t>>(
587  buffer, omnisci_column, parquet_column);
588  } else {
589  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t, int32_t>>(
590  buffer, omnisci_column, parquet_column);
591  }
592  } else if (parquet_column->physical_type() == parquet::Type::INT64) {
593  if (column_type.get_compression() == kENCODING_NONE) {
594  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
595  buffer, omnisci_column, parquet_column);
596  } else if (column_type.get_compression() == kENCODING_FIXED) {
597  CHECK(column_type.get_comp_param() == 32);
598  if (is_metadata_scan_or_for_import) {
599  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int32_t>>(
600  buffer, omnisci_column, parquet_column);
601  } else {
602  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int64_t, int32_t>>(
603  buffer, omnisci_column, parquet_column);
604  }
605  }
606  } else {
607  UNREACHABLE();
608  }
609  }
610  return {};
611 }
#define UNREACHABLE()
Definition: Logger.h:338
int get_precision() const
Definition: sqltypes.h:394
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder_with_types ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 481 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

484  {
485  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
486  parquet_column->logical_type().get())) {
487  switch (timestamp_logical_type->time_unit()) {
488  case parquet::LogicalType::TimeUnit::MILLIS:
489  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L, NullType>>(
490  buffer, omnisci_column, parquet_column);
491  case parquet::LogicalType::TimeUnit::MICROS:
492  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
493  buffer, omnisci_column, parquet_column);
494  case parquet::LogicalType::TimeUnit::NANOS:
495  return std::make_shared<
496  ParquetTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
497  buffer, omnisci_column, parquet_column);
498  default:
499  UNREACHABLE();
500  }
501  } else {
502  UNREACHABLE();
503  }
504  return {};
505 }
#define UNREACHABLE()
Definition: Logger.h:338
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1332 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1332  {
1333  return omnisci_column->columnType.get_dimension() == 6;
1334 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1336 of file LazyParquetChunkLoader.cpp.

1337  {
1338  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MICROS;
1339 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1341 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1341  {
1342  return omnisci_column->columnType.get_dimension() == 3;
1343 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1345 of file LazyParquetChunkLoader.cpp.

1346  {
1347  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS;
1348 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1323 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1323  {
1324  return omnisci_column->columnType.get_dimension() == 9;
1325 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1327 of file LazyParquetChunkLoader.cpp.

1328  {
1329  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::NANOS;
1330 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_list_column ( const parquet::ColumnDescriptor *  parquet_column)

Detect a valid list parquet column.

Parameters
parquet_column- the parquet column descriptor of the column to detect
Returns
true if it is a valid parquet list column

Note: the notion of a valid parquet list column is adapted from the parquet schema specification for logical type definitions:

<list-repetition> group <name> (LIST) { repeated group list { <element-repetition> <element-type> element; } }

Testing has shown that there are small deviations from this specification in at least one library — pyarrow — where the innermost schema node is named "item" as opposed to "element".

The following is also true of the schema definition.

  • The outer-most level must be a group annotated with LIST that contains a single field named list. The repetition of this level must be either optional or required and determines whether the list is nullable.
  • The middle level, named list, must be a repeated group with a single field named element.
  • The element field encodes the list's element type and repetition. Element repetition must be required or optional.

FSI further restricts lists to be defined only at the top level, meaning directly below the root schema node.

Definition at line 101 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_array_encoder(), set_definition_levels_for_zero_max_definition_level_case(), foreign_storage::LazyParquetChunkLoader::suggestColumnMapping(), validate_allowed_mapping(), validate_column_mapping_and_row_group_metadata(), validate_list_column_metadata_statistics(), and validate_max_repetition_and_definition_level().

101  {
102  const parquet::schema::Node* node = parquet_column->schema_node().get();
103  if ((node->name() != "element" && node->name() != "item") ||
104  !(node->is_required() ||
105  node->is_optional())) { // ensure first innermost node is named "element"
106  // which is required by the parquet specification;
107  // however testing shows that pyarrow generates this
108  // column with the name of "item"
109  // this field must be either required or optional
110  return false;
111  }
112  node = node->parent();
113  if (!node) { // required nested structure
114  return false;
115  }
116  if (node->name() != "list" || !node->is_repeated() ||
117  !node->is_group()) { // ensure second innermost node is named "list" which is
118  // a repeated group; this is
119  // required by the parquet specification
120  return false;
121  }
122  node = node->parent();
123  if (!node) { // required nested structure
124  return false;
125  }
126  if (!node->logical_type()->is_list() ||
127  !(node->is_optional() ||
128  node->is_required())) { // ensure third outermost node has logical type LIST
129  // which is either optional or required; this is required
130  // by the parquet specification
131  return false;
132  }
133  node =
134  node->parent(); // this must now be the root node of schema which is required by
135  // FSI (lists can not be embedded into a deeper nested structure)
136  if (!node) { // required nested structure
137  return false;
138  }
139  node = node->parent();
140  if (node) { // implies the previous node was not the root node
141  return false;
142  }
143  return true;
144 }

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_string ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 59 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_geospatial_encoder(), create_parquet_string_encoder(), suggest_column_scalar_type(), suggest_string_mapping(), validate_geospatial_mapping(), and validate_string_mapping().

59  {
60  return (parquet_column->logical_type()->is_none() &&
61  parquet_column->physical_type() == parquet::Type::BYTE_ARRAY) ||
62  parquet_column->logical_type()->is_string();
63 }

+ Here is the caller graph for this function:

std::list<RowGroupMetadata> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval ( const std::map< int, std::shared_ptr< ParquetEncoder >> &  encoder_map,
const RowGroupInterval &  row_group_interval,
const ReaderPtr &  reader,
const ForeignTableSchema &  schema 
)

Definition at line 1715 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnId, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::ForeignTableSchema::getLogicalColumn(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::RowGroupInterval::start_index.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1719  {
1720  std::list<RowGroupMetadata> row_group_metadata;
1721  auto column_interval =
1722  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
1723  schema.getLogicalAndPhysicalColumns().back()->columnId};
1724 
1725  auto file_metadata = reader->parquet_reader()->metadata();
1726  for (int row_group = row_group_interval.start_index;
1727  row_group <= row_group_interval.end_index;
1728  ++row_group) {
1729  auto& row_group_metadata_item = row_group_metadata.emplace_back();
1730  row_group_metadata_item.row_group_index = row_group;
1731  row_group_metadata_item.file_path = row_group_interval.file_path;
1732 
1733  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1734  file_metadata->RowGroup(row_group);
1735 
1736  for (int column_id = column_interval.start; column_id <= column_interval.end;
1737  column_id++) {
1738  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1739  auto parquet_column_index = schema.getParquetColumnIndex(column_id);
1740  auto encoder_map_iter =
1741  encoder_map.find(schema.getLogicalColumn(column_id)->columnId);
1742  CHECK(encoder_map_iter != encoder_map.end());
1743  try {
1744  auto metadata = encoder_map_iter->second->getRowGroupMetadata(
1745  group_metadata.get(), parquet_column_index, column_descriptor->columnType);
1746  row_group_metadata_item.column_chunk_metadata.emplace_back(metadata);
1747  } catch (const std::exception& e) {
1748  std::stringstream error_message;
1749  error_message << e.what() << " in row group " << row_group << " of Parquet file '"
1750  << row_group_interval.file_path << "'.";
1751  throw std::runtime_error(error_message.str());
1752  }
1753  }
1754  }
1755  return row_group_metadata;
1756 }
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_import ( const std::map< int, Chunk_NS::Chunk >  chunks,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const std::map< int, StringDictionary * >  column_dictionaries,
const int64_t  num_rows,
const bool  geo_validate_geometry 
)

Definition at line 1758 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_import(), shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), and foreign_storage::ForeignTableSchema::getParquetColumnIndex().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1764  {
1765  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1766  auto file_metadata = reader->parquet_reader()->metadata();
1767  for (auto& [column_id, chunk] : chunks) {
1768  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1769  if (column_descriptor->isGeoPhyCol) { // skip physical columns
1770  continue;
1771  }
1772  auto parquet_column_descriptor =
1773  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1774  auto find_it = column_dictionaries.find(column_id);
1775  StringDictionary* dictionary =
1776  (find_it == column_dictionaries.end() ? nullptr : find_it->second);
1777  std::list<Chunk_NS::Chunk> chunks_for_import;
1778  chunks_for_import.push_back(chunk);
1779  if (column_descriptor->columnType.is_geometry()) {
1780  for (int i = 0; i < column_descriptor->columnType.get_physical_cols(); ++i) {
1781  chunks_for_import.push_back(chunks.at(column_id + i + 1));
1782  }
1783  }
1784  encoder_map[column_id] = create_parquet_encoder_for_import(chunks_for_import,
1785  column_descriptor,
1786  parquet_column_descriptor,
1787  dictionary,
1788  geo_validate_geometry);
1789 
1790  // reserve space in buffer when num-elements known ahead of time for types
1791  // of known size (for example dictionary encoded strings)
1792  auto encoder = shared::get_from_map(encoder_map, column_id);
1793  if (auto inplace_encoder = dynamic_cast<ParquetInPlaceEncoder*>(encoder.get())) {
1794  inplace_encoder->reserve(num_rows);
1795  }
1796  }
1797  return encoder_map;
1798 }
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:61
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import(std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const bool geo_validate_geometry)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_metadata_scan ( const Interval< ColumnType > &  column_interval,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const bool  do_metadata_stats_validation,
const bool  geo_validate_geometry 
)

Definition at line 1800 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_metadata_scan(), foreign_storage::Interval< T >::end, shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::Interval< T >::start.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1805  {
1806  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1807  auto file_metadata = reader->parquet_reader()->metadata();
1808  for (int column_id = column_interval.start; column_id <= column_interval.end;
1809  column_id++) {
1810  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1811  auto parquet_column_descriptor =
1812  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1813  encoder_map[column_id] = create_parquet_encoder_for_metadata_scan(
1814  column_descriptor, parquet_column_descriptor, geo_validate_geometry);
1815  if (!do_metadata_stats_validation) {
1816  shared::get_from_map(encoder_map, column_id)->disableMetadataStatsValidation();
1817  }
1818  column_id += column_descriptor->columnType.get_physical_cols();
1819  }
1820  return encoder_map;
1821 }
T const end
Definition: Intervals.h:68
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:61
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const bool geo_validate_geometry)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::vector< int8_t > &  values 
)

Definition at line 1170 of file LazyParquetChunkLoader.cpp.

References foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements, ColumnDescriptor::columnType, and SQLTypeInfo::get_size().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1172  {
1173  auto max_type_byte_size =
1174  std::max(omnisci_column->columnType.get_size(),
1175  parquet::GetTypeByteSize(parquet_column->physical_type()));
1176  size_t values_size =
1177  LazyParquetChunkLoader::batch_reader_num_elements * max_type_byte_size;
1178  values.resize(values_size);
1179 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::set_definition_levels_for_zero_max_definition_level_case ( const parquet::ColumnDescriptor *  parquet_column_descriptor,
std::vector< int16_t > &  def_levels 
)

This function sets the definition levels to 1 for all read values in the case of required scalar/flat columns. The definition level of one informs all subsequent calls to parquet encoders to treat the read data as not null.

Definition at line 1113 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_list_column().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups().

1115  {
1116  if (!is_valid_parquet_list_column(parquet_column_descriptor) &&
1117  parquet_column_descriptor->max_definition_level() == 0) {
1118  if (!parquet_column_descriptor->schema_node()->is_required()) {
1119  throw std::runtime_error(
1120  "Unsupported parquet column detected. Column '" +
1121  parquet_column_descriptor->path()->ToDotString() +
1122  "' detected to have max definition level of 0 but is optional.");
1123  }
1124  def_levels.assign(def_levels.size(), 1);
1125  }
1126 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_boolean_type_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1359 of file LazyParquetChunkLoader.cpp.

References kBOOLEAN, kENCODING_NONE, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1360  {
1361  SQLTypeInfo type;
1363  type.set_type(kBOOLEAN);
1364  type.set_fixed_size();
1365  return type;
1366 }
void set_compression(EncodingType c)
Definition: sqltypes.h:479
void set_fixed_size()
Definition: sqltypes.h:477
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:468

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_column_scalar_type ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1564 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_string(), suggest_boolean_type_mapping(), suggest_date_mapping(), suggest_decimal_mapping(), suggest_floating_point_mapping(), suggest_integral_mapping(), suggest_string_mapping(), suggest_time_mapping(), and suggest_timestamp_mapping().

Referenced by foreign_storage::LazyParquetChunkLoader::suggestColumnMapping().

1564  {
1565  // decimal case
1566  if (parquet_column->logical_type()->is_decimal()) {
1567  return suggest_decimal_mapping(parquet_column);
1568  }
1569  // float case
1570  if (parquet_column->logical_type()->is_none() &&
1571  (parquet_column->physical_type() == parquet::Type::FLOAT ||
1572  parquet_column->physical_type() == parquet::Type::DOUBLE)) {
1573  return suggest_floating_point_mapping(parquet_column);
1574  }
1575  // integral case
1576  if ((parquet_column->logical_type()->is_none() &&
1577  (parquet_column->physical_type() == parquet::Type::INT32 ||
1578  parquet_column->physical_type() == parquet::Type::INT64)) ||
1579  parquet_column->logical_type()->is_int()) {
1580  return suggest_integral_mapping(parquet_column);
1581  }
1582  // boolean case
1583  if (parquet_column->logical_type()->is_none() &&
1584  parquet_column->physical_type() == parquet::Type::BOOLEAN) {
1585  return suggest_boolean_type_mapping(parquet_column);
1586  }
1587  // timestamp case
1588  if (parquet_column->logical_type()->is_timestamp()) {
1589  return suggest_timestamp_mapping(parquet_column);
1590  }
1591  // time case
1592  if (parquet_column->logical_type()->is_time()) {
1593  return suggest_time_mapping(parquet_column);
1594  }
1595  // date case
1596  if (parquet_column->logical_type()->is_date()) {
1597  return suggest_date_mapping(parquet_column);
1598  }
1599  // string case
1600  if (is_valid_parquet_string(parquet_column)) {
1601  return suggest_string_mapping(parquet_column);
1602  }
1603 
1604  throw ForeignStorageException("Unsupported data type detected for column: " +
1605  parquet_column->ToString());
1606 }
SQLTypeInfo suggest_decimal_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_timestamp_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_string_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_date_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_floating_point_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_integral_mapping(const parquet::ColumnDescriptor *parquet_column)
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_boolean_type_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_time_mapping(const parquet::ColumnDescriptor *parquet_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_date_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1463 of file LazyParquetChunkLoader.cpp.

References CHECK, kDATE, kENCODING_NONE, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1463  {
1464  CHECK(parquet_column->logical_type()->is_date());
1465  SQLTypeInfo type;
1466  type.set_type(kDATE);
1467  type.set_compression(kENCODING_NONE);
1468  type.set_fixed_size();
1469  return type;
1470 }
Definition: sqltypes.h:80
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_decimal_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1195 of file LazyParquetChunkLoader.cpp.

References kDECIMAL, kENCODING_NONE, sql_constants::kMaxNumericPrecision, SQLTypeInfo::scale, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_scale(), SQLTypeInfo::set_type(), to_string(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1195  {
1196  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1197  parquet_column->logical_type().get())) {
1198  auto parquet_precision = decimal_logical_column->precision();
1199  auto parquet_scale = decimal_logical_column->scale();
1200  if (parquet_precision > sql_constants::kMaxNumericPrecision) {
1201  throw ForeignStorageException(
1202  "Parquet column \"" + parquet_column->ToString() +
1203  "\" has decimal precision of " + std::to_string(parquet_precision) +
1204  " which is too high to import, maximum precision supported is " +
1206  }
1207  SQLTypeInfo type;
1208  type.set_type(kDECIMAL);
1210  type.set_precision(parquet_precision);
1211  type.set_scale(parquet_scale);
1212  type.set_fixed_size();
1213  return type;
1214  }
1215  UNREACHABLE()
1216  << " a Parquet column's decimal logical type failed to be read appropriately";
1217  return {};
1218 }
void set_compression(EncodingType c)
Definition: sqltypes.h:479
static constexpr int32_t kMaxNumericPrecision
Definition: sqltypes.h:58
#define UNREACHABLE()
Definition: Logger.h:338
std::string to_string(char const *&&v)
void set_fixed_size()
Definition: sqltypes.h:477
void set_scale(int s)
Definition: sqltypes.h:473
void set_precision(int d)
Definition: sqltypes.h:471
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:468

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_floating_point_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1236 of file LazyParquetChunkLoader.cpp.

References kDOUBLE, kENCODING_NONE, kFLOAT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1237  {
1238  SQLTypeInfo type;
1239  if (parquet_column->physical_type() == parquet::Type::FLOAT) {
1240  type.set_type(kFLOAT);
1241  } else if (parquet_column->physical_type() == parquet::Type::DOUBLE) {
1242  type.set_type(kDOUBLE);
1243  } else {
1244  UNREACHABLE();
1245  }
1247  type.set_fixed_size();
1248  return type;
1249 }
void set_compression(EncodingType c)
Definition: sqltypes.h:479
#define UNREACHABLE()
Definition: Logger.h:338
void set_fixed_size()
Definition: sqltypes.h:477
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:468

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_integral_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1278 of file LazyParquetChunkLoader.cpp.

References CHECK, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and within_range().

Referenced by suggest_column_scalar_type().

1278  {
1279  SQLTypeInfo type;
1281  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1282  parquet_column->logical_type().get())) {
1283  auto bit_width = int_logical_column->bit_width();
1284  if (!int_logical_column->is_signed()) {
1285  if (within_range(33, 64, bit_width)) {
1286  throw ForeignStorageException(
1287  "Unsigned integer column \"" + parquet_column->path()->ToDotString() +
1288  "\" in Parquet file with 64 bit-width has no supported type for ingestion "
1289  "that will not result in data loss");
1290  } else if (within_range(17, 32, bit_width)) {
1291  type.set_type(kBIGINT);
1292  } else if (within_range(9, 16, bit_width)) {
1293  type.set_type(kINT);
1294  } else if (within_range(0, 8, bit_width)) {
1295  type.set_type(kSMALLINT);
1296  }
1297  } else {
1298  if (within_range(33, 64, bit_width)) {
1299  type.set_type(kBIGINT);
1300  } else if (within_range(17, 32, bit_width)) {
1301  type.set_type(kINT);
1302  } else if (within_range(9, 16, bit_width)) {
1303  type.set_type(kSMALLINT);
1304  } else if (within_range(0, 8, bit_width)) {
1305  type.set_type(kTINYINT);
1306  }
1307  }
1308  type.set_fixed_size();
1309  return type;
1310  }
1311 
1312  CHECK(parquet_column->logical_type()->is_none());
1313  if (parquet_column->physical_type() == parquet::Type::INT32) {
1314  type.set_type(kINT);
1315  } else {
1316  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
1317  type.set_type(kBIGINT);
1318  }
1319  type.set_fixed_size();
1320  return type;
1321 }
void set_compression(EncodingType c)
Definition: sqltypes.h:479
void set_fixed_size()
Definition: sqltypes.h:477
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqltypes.h:72
bool within_range(int64_t lower_bound, int64_t upper_bound, int64_t value)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:468

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_string_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1480 of file LazyParquetChunkLoader.cpp.

References CHECK, is_valid_parquet_string(), kENCODING_DICT, kTEXT, SQLTypeInfo::set_comp_param(), SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1480  {
1481  CHECK(is_valid_parquet_string(parquet_column));
1482  SQLTypeInfo type;
1483  type.set_type(kTEXT);
1485  type.set_comp_param(32);
1486  type.set_fixed_size();
1487  return type;
1488 }
void set_compression(EncodingType c)
Definition: sqltypes.h:479
void set_fixed_size()
Definition: sqltypes.h:477
void set_comp_param(int p)
Definition: sqltypes.h:480
Definition: sqltypes.h:79
#define CHECK(condition)
Definition: Logger.h:291
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:468

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_time_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1437 of file LazyParquetChunkLoader.cpp.

References CHECK, kENCODING_NONE, kTIME, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1437  {
1438  CHECK(parquet_column->logical_type()->is_time());
1439  SQLTypeInfo type;
1440  type.set_type(kTIME);
1441  type.set_compression(kENCODING_NONE);
1442  type.set_fixed_size();
1443  return type;
1444 }
Definition: sqltypes.h:76
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_timestamp_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1403 of file LazyParquetChunkLoader.cpp.

References is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_NONE, kTIMESTAMP, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1403  {
1404  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1405  parquet_column->logical_type().get())) {
1406  SQLTypeInfo type;
1407  type.set_type(kTIMESTAMP);
1409  if (is_nanosecond_precision(timestamp_logical_column)) {
1410  type.set_precision(9);
1411  } else if (is_microsecond_precision(timestamp_logical_column)) {
1412  type.set_precision(6);
1413  } else if (is_millisecond_precision(timestamp_logical_column)) {
1414  type.set_precision(3);
1415  }
1416  type.set_fixed_size();
1417  return type;
1418  }
1419  UNREACHABLE();
1420  return {};
1421 }
void set_compression(EncodingType c)
Definition: sqltypes.h:479
#define UNREACHABLE()
Definition: Logger.h:338
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
void set_fixed_size()
Definition: sqltypes.h:477
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
void set_precision(int d)
Definition: sqltypes.h:471
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:468

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_missing_metadata_error ( const int  row_group_index,
const int  column_index,
const std::string &  file_path 
)

Definition at line 1618 of file LazyParquetChunkLoader.cpp.

References to_string().

Referenced by validate_column_mapping_and_row_group_metadata().

1620  {
1621  throw std::runtime_error{
1622  "Statistics metadata is required for all row groups. Metadata is missing for "
1623  "row group index: " +
1624  std::to_string(row_group_index) +
1625  ", column index: " + std::to_string(column_index) + ", file path: " + file_path};
1626 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_row_group_larger_than_fragment_size_error ( const MaxRowGroupSizeStats  max_row_group_stats,
const int  fragment_size 
)

Definition at line 1634 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::file_path, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_index, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_size, and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1636  {
1637  auto metadata_scan_exception = MetadataScanInfeasibleFragmentSizeException{
1638  "Parquet file has a row group size that is larger than the fragment size. "
1639  "Please set the table fragment size to a number that is larger than the "
1640  "row group size. Row group index: " +
1641  std::to_string(max_row_group_stats.max_row_group_index) +
1642  ", row group size: " + std::to_string(max_row_group_stats.max_row_group_size) +
1643  ", fragment size: " + std::to_string(fragment_size) +
1644  ", file path: " + max_row_group_stats.file_path};
1645  metadata_scan_exception.min_feasible_fragment_size_ =
1646  max_row_group_stats.max_row_group_size;
1647  throw metadata_scan_exception;
1648 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping ( const parquet::ColumnDescriptor *  parquet_column,
const ColumnDescriptor omnisci_column 
)

Definition at line 1522 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnName, ColumnDescriptor::columnType, foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::get_type_name(), SQLTypeInfo::is_array(), is_valid_parquet_list_column(), foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported(), LOG, run_benchmark_import::type, validate_max_repetition_and_definition_level(), and logger::WARNING.

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_column_mapping_and_row_group_metadata().

1523  {
1524  validate_max_repetition_and_definition_level(omnisci_column, parquet_column);
1525  bool allowed_type = false;
1526  if (omnisci_column->columnType.is_array()) {
1527  if (is_valid_parquet_list_column(parquet_column)) {
1528  auto omnisci_column_sub_type_column =
1529  get_sub_type_column_descriptor(omnisci_column);
1530  allowed_type = LazyParquetChunkLoader::isColumnMappingSupported(
1531  omnisci_column_sub_type_column.get(), parquet_column);
1532  }
1533  } else {
1534  allowed_type =
1535  LazyParquetChunkLoader::isColumnMappingSupported(omnisci_column, parquet_column);
1536  }
1537  if (!allowed_type) {
1538  auto logical_type = parquet_column->logical_type();
1539  if (logical_type->is_timestamp()) {
1540  auto timestamp_type =
1541  dynamic_cast<const parquet::TimestampLogicalType*>(logical_type.get());
1542  CHECK(timestamp_type);
1543 
1544  if (!timestamp_type->is_adjusted_to_utc()) {
1545  LOG(WARNING) << "Non-UTC timezone specified in Parquet file for column \""
1546  << omnisci_column->columnName
1547  << "\". Only UTC timezone is currently supported.";
1548  }
1549  }
1550  std::string parquet_type;
1551  parquet::Type::type physical_type = parquet_column->physical_type();
1552  if (parquet_column->logical_type()->is_none()) {
1553  parquet_type = parquet::TypeToString(physical_type);
1554  } else {
1555  parquet_type = logical_type->ToString();
1556  }
1557  std::string omnisci_type = omnisci_column->columnType.get_type_name();
1558  throw std::runtime_error{"Conversion from Parquet type \"" + parquet_type +
1559  "\" to HeavyDB type \"" + omnisci_type +
1560  "\" is not allowed. Please use an appropriate column type."};
1561  }
1562 }
#define LOG(tag)
Definition: Logger.h:285
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::string get_type_name() const
Definition: sqltypes.h:482
#define CHECK(condition)
Definition: Logger.h:291
void validate_max_repetition_and_definition_level(const ColumnDescriptor *omnisci_column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
SQLTypeInfo columnType
std::string columnName
bool is_array() const
Definition: sqltypes.h:583

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_column_mapping_and_row_group_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema,
const bool  do_metadata_stats_validation 
)

Definition at line 1650 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::getLogicalColumns(), is_valid_parquet_list_column(), report::stats, throw_missing_metadata_error(), and validate_allowed_mapping().

Referenced by validate_parquet_metadata().

1654  {
1655  auto column_it = schema.getLogicalColumns().begin();
1656  MaxRowGroupSizeStats max_row_group_stats{0, 0};
1657  for (int i = 0; i < file_metadata->num_columns(); ++i, ++column_it) {
1658  const parquet::ColumnDescriptor* descr = file_metadata->schema()->Column(i);
1659  try {
1660  validate_allowed_mapping(descr, *column_it);
1661  } catch (std::runtime_error& e) {
1662  std::stringstream error_message;
1663  error_message << e.what() << " Parquet column: " << descr->path()->ToDotString()
1664  << ", HeavyDB column: " << (*column_it)->columnName
1665  << ", Parquet file: " << file_path << ".";
1666  throw std::runtime_error(error_message.str());
1667  }
1668 
1669  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
1670  auto group_metadata = file_metadata->RowGroup(r);
1671  auto num_rows = group_metadata->num_rows();
1672  if (num_rows == 0) {
1673  continue;
1674  } else if (num_rows > max_row_group_stats.max_row_group_size) {
1675  max_row_group_stats.max_row_group_size = num_rows;
1676  max_row_group_stats.max_row_group_index = r;
1677  max_row_group_stats.file_path = file_path;
1678  }
1679 
1680  if (do_metadata_stats_validation) {
1681  auto column_chunk = group_metadata->ColumnChunk(i);
1682  bool contains_metadata = column_chunk->is_stats_set();
1683  if (contains_metadata) {
1684  auto stats = column_chunk->statistics();
1685  bool is_all_nulls = stats->null_count() == column_chunk->num_values();
1686  bool is_list = is_valid_parquet_list_column(file_metadata->schema()->Column(i));
1687  // Given a list, it is possible it has no min or max if it is comprised
1688  // only of empty lists & nulls. This can not be detected by comparing
1689  // the null count; therefore we afford list types the benefit of the
1690  // doubt in this situation.
1691  if (!(stats->HasMinMax() || is_all_nulls || is_list)) {
1692  contains_metadata = false;
1693  }
1694  }
1695 
1696  if (!contains_metadata) {
1697  throw_missing_metadata_error(r, i, file_path);
1698  }
1699  }
1700  }
1701  }
1702  return max_row_group_stats;
1703 }
dictionary stats
Definition: report.py:116
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
void throw_missing_metadata_error(const int row_group_index, const int column_index, const std::string &file_path)
void validate_allowed_mapping(const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1446 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kDATE, kENCODING_DATE_IN_DAYS, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1447  {
1448  if (!(omnisci_column->columnType.get_type() == kDATE &&
1449  ((omnisci_column->columnType.get_compression() == kENCODING_DATE_IN_DAYS &&
1450  (omnisci_column->columnType.get_comp_param() ==
1451  0 // DATE ENCODING DAYS (32) specifies comp_param of 0
1452  || omnisci_column->columnType.get_comp_param() == 16)) ||
1453  omnisci_column->columnType.get_compression() ==
1454  kENCODING_NONE // for array types
1455  ))) {
1456  return false;
1457  }
1458  return parquet_column->logical_type()->is_date() ||
1459  parquet_column->logical_type()
1460  ->is_timestamp(); // to support TIMESTAMP -> DATE coercion
1461 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
Definition: sqltypes.h:80
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1181 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_precision(), SQLTypeInfo::get_scale(), SQLTypeInfo::is_decimal(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1182  {
1183  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1184  parquet_column->logical_type().get())) {
1185  return omnisci_column->columnType.get_precision() ==
1186  decimal_logical_column->precision() &&
1187  omnisci_column->columnType.get_scale() == decimal_logical_column->scale() &&
1188  omnisci_column->columnType.is_decimal() &&
1189  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1190  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1191  }
1192  return false;
1193 }
HOST DEVICE int get_scale() const
Definition: sqltypes.h:396
int get_precision() const
Definition: sqltypes.h:394
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
SQLTypeInfo columnType
bool is_decimal() const
Definition: sqltypes.h:568

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema ( const parquet::arrow::FileReader *  reference_file_reader,
const parquet::arrow::FileReader *  new_file_reader,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 1496 of file LazyParquetChunkLoader.cpp.

References foreign_storage::get_column_descriptor(), to_string(), and foreign_storage::validate_equal_column_descriptor().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan(), and foreign_storage::LazyParquetChunkLoader::previewFiles().

1499  {
1500  const auto reference_num_columns =
1501  reference_file_reader->parquet_reader()->metadata()->num_columns();
1502  const auto new_num_columns =
1503  new_file_reader->parquet_reader()->metadata()->num_columns();
1504  if (reference_num_columns != new_num_columns) {
1505  throw std::runtime_error{"Parquet file \"" + new_file_path +
1506  "\" has a different schema. Please ensure that all Parquet "
1507  "files use the same schema. Reference Parquet file: \"" +
1508  reference_file_path + "\" has " +
1509  std::to_string(reference_num_columns) +
1510  " columns. New Parquet file \"" + new_file_path + "\" has " +
1511  std::to_string(new_num_columns) + " columns."};
1512  }
1513 
1514  for (int i = 0; i < reference_num_columns; i++) {
1515  validate_equal_column_descriptor(get_column_descriptor(reference_file_reader, i),
1516  get_column_descriptor(new_file_reader, i),
1517  reference_file_path,
1518  new_file_path);
1519  }
1520 }
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
std::string to_string(char const *&&v)
const ColumnDescriptor * get_column_descriptor(const shared::ColumnKey &column_key)
Definition: Execute.h:213

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1220 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), SQLTypeInfo::is_fp(), kENCODING_NONE, and kFLOAT.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1221  {
1222  if (!omnisci_column->columnType.is_fp()) {
1223  return false;
1224  }
1225  // check if mapping is a valid coerced or non-coerced floating point mapping
1226  // with no annotation (floating point columns have no annotation in the
1227  // Parquet specification)
1228  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1229  return (parquet_column->physical_type() == parquet::Type::DOUBLE) ||
1230  (parquet_column->physical_type() == parquet::Type::FLOAT &&
1231  omnisci_column->columnType.get_type() == kFLOAT);
1232  }
1233  return false;
1234 }
bool is_fp() const
Definition: sqltypes.h:571
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1490 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_geometry(), and is_valid_parquet_string().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1491  {
1492  return is_valid_parquet_string(parquet_column) &&
1493  omnisci_column->columnType.is_geometry();
1494 }
bool is_geometry() const
Definition: sqltypes.h:595
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1251 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_size(), SQLTypeInfo::is_integer(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1252  {
1253  if (!omnisci_column->columnType.is_integer()) {
1254  return false;
1255  }
1256  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1257  parquet_column->logical_type().get())) {
1258  CHECK(omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1259  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1260  const int bits_per_byte = 8;
1261  // unsigned types are permitted to map to a wider integral type in order to avoid
1262  // precision loss
1263  const int bit_widening_factor = int_logical_column->is_signed() ? 1 : 2;
1264  return omnisci_column->columnType.get_size() * bits_per_byte <=
1265  int_logical_column->bit_width() * bit_widening_factor;
1266  }
1267  // check if mapping is a valid coerced or non-coerced integral mapping with no
1268  // annotation
1269  if ((omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1270  omnisci_column->columnType.get_compression() == kENCODING_FIXED)) {
1271  return (parquet_column->physical_type() == parquet::Type::INT64) ||
1272  (parquet_column->physical_type() == parquet::Type::INT32 &&
1273  omnisci_column->columnType.get_size() <= 4);
1274  }
1275  return false;
1276 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
bool is_integer() const
Definition: sqltypes.h:565
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_list_column_metadata_statistics ( const parquet::ParquetFileReader *  reader,
const int  row_group_index,
const int  column_index,
const int16_t *  def_levels,
const int64_t  num_levels,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1077 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_list_column(), report::stats, and foreign_storage::validate_and_get_column_metadata_statistics().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1083  {
1084  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1085  if (!is_valid_parquet_list) {
1086  return;
1087  }
1088  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1089  reader->metadata()->RowGroup(row_group_index);
1090  auto column_metadata = group_metadata->ColumnChunk(column_index);
1091  // In case of a empty row group do not validate
1092  if (group_metadata->num_rows() == 0) {
1093  return;
1094  }
1095  auto stats = validate_and_get_column_metadata_statistics(column_metadata.get());
1096  if (!stats->HasMinMax()) {
1097  auto find_it = std::find_if(def_levels,
1098  def_levels + num_levels,
1099  [](const int16_t def_level) { return def_level == 3; });
1100  if (find_it != def_levels + num_levels) {
1101  throw std::runtime_error(
1102  "No minimum and maximum statistic set in list column but non-null & non-empty "
1103  "array/value detected.");
1104  }
1105  }
1106 }
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
dictionary stats
Definition: report.py:116
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level ( const ColumnDescriptor omnisci_column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1128 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_allowed_mapping().

1130  {
1131  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1132  if (is_valid_parquet_list && !omnisci_column_descriptor->columnType.is_array()) {
1133  throw std::runtime_error(
1134  "Unsupported mapping detected. Column '" +
1135  parquet_column_descriptor->path()->ToDotString() +
1136  "' detected to be a parquet list but HeavyDB mapped column '" +
1137  omnisci_column_descriptor->columnName + "' is not an array.");
1138  }
1139  if (is_valid_parquet_list) {
1140  if (parquet_column_descriptor->max_repetition_level() != 1 ||
1141  parquet_column_descriptor->max_definition_level() != 3) {
1142  throw std::runtime_error(
1143  "Incorrect schema max repetition level detected in column '" +
1144  parquet_column_descriptor->path()->ToDotString() +
1145  "'. Expected a max repetition level of 1 and max definition level of 3 for "
1146  "list column but column has a max "
1147  "repetition level of " +
1148  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1149  " and a max definition level of " +
1150  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1151  }
1152  } else {
1153  if (parquet_column_descriptor->max_repetition_level() != 0 ||
1154  !(parquet_column_descriptor->max_definition_level() == 1 ||
1155  parquet_column_descriptor->max_definition_level() == 0)) {
1156  throw std::runtime_error(
1157  "Incorrect schema max repetition level detected in column '" +
1158  parquet_column_descriptor->path()->ToDotString() +
1159  "'. Expected a max repetition level of 0 and max definition level of 1 or 0 "
1160  "for "
1161  "flat column but column has a max "
1162  "repetition level of " +
1163  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1164  " and a max definition level of " +
1165  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1166  }
1167  }
1168 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::string to_string(char const *&&v)
SQLTypeInfo columnType
std::string columnName
bool is_array() const
Definition: sqltypes.h:583

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1350 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kBOOLEAN, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1351  {
1352  bool is_none_encoded_mapping =
1353  omnisci_column->columnType.get_compression() == kENCODING_NONE &&
1354  (parquet_column->physical_type() == parquet::Type::BOOLEAN &&
1355  omnisci_column->columnType.get_type() == kBOOLEAN);
1356  return parquet_column->logical_type()->is_none() && is_none_encoded_mapping;
1357 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1608 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::numLogicalColumns(), and foreign_storage::throw_number_of_columns_mismatch_error().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_parquet_metadata().

1611  {
1612  if (schema.numLogicalColumns() != file_metadata->num_columns()) {
1614  schema.numLogicalColumns(), file_metadata->num_columns(), file_path);
1615  }
1616 }
void throw_number_of_columns_mismatch_error(size_t num_table_cols, size_t num_file_cols, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema,
const bool  do_metadata_stats_validation 
)

Definition at line 1705 of file LazyParquetChunkLoader.cpp.

References validate_column_mapping_and_row_group_metadata(), and validate_number_of_columns().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1709  {
1710  validate_number_of_columns(file_metadata, file_path, schema);
1712  file_metadata, file_path, schema, do_metadata_stats_validation);
1713 }
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema, const bool do_metadata_stats_validation)
void validate_number_of_columns(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1472 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1473  {
1474  return is_valid_parquet_string(parquet_column) &&
1475  omnisci_column->columnType.is_string() &&
1476  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1477  omnisci_column->columnType.get_compression() == kENCODING_DICT);
1478 }
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:559

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1423 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kENCODING_FIXED, kENCODING_NONE, and kTIME.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1424  {
1425  if (!(omnisci_column->columnType.get_type() == kTIME &&
1426  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1427  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1428  omnisci_column->columnType.get_comp_param() == 32)))) {
1429  return false;
1430  }
1431  if (parquet_column->logical_type()->is_time()) {
1432  return true;
1433  }
1434  return false;
1435 }
Definition: sqltypes.h:76
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1368 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_dimension(), SQLTypeInfo::get_type(), is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_FIXED, kENCODING_NONE, and kTIMESTAMP.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1369  {
1370  if (!(omnisci_column->columnType.get_type() == kTIMESTAMP &&
1371  ((omnisci_column->columnType.get_compression() == kENCODING_NONE) ||
1372  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1373  omnisci_column->columnType.get_comp_param() == 32)))) {
1374  return false;
1375  }
1376  // check the annotated case
1377  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1378  parquet_column->logical_type().get())) {
1379  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1380  return omnisci_column->columnType.get_dimension() == 0 ||
1381  ((is_nanosecond_precision(omnisci_column) &&
1382  is_nanosecond_precision(timestamp_logical_column)) ||
1383  (is_microsecond_precision(omnisci_column) &&
1384  is_microsecond_precision(timestamp_logical_column)) ||
1385  (is_millisecond_precision(omnisci_column) &&
1386  is_millisecond_precision(timestamp_logical_column)));
1387  }
1388  if (omnisci_column->columnType.get_compression() == kENCODING_FIXED) {
1389  return omnisci_column->columnType.get_dimension() == 0;
1390  }
1391  }
1392  // check the unannotated case
1393  if (parquet_column->logical_type()->is_none() &&
1394  ((parquet_column->physical_type() == parquet::Type::INT32 &&
1395  omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1396  omnisci_column->columnType.get_comp_param() == 32) ||
1397  parquet_column->physical_type() == parquet::Type::INT64)) {
1398  return true;
1399  }
1400  return false;
1401 }
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::within_range ( int64_t  lower_bound,
int64_t  upper_bound,
int64_t  value 
)

Definition at line 55 of file LazyParquetChunkLoader.cpp.

References gpu_enabled::upper_bound().

Referenced by suggest_integral_mapping().

// Inclusive interval membership test: lower_bound <= value <= upper_bound.
bool within_range(int64_t lower_bound, int64_t upper_bound, int64_t value) {
  if (value < lower_bound) {
    return false;
  }
  return value <= upper_bound;
}
DEVICE auto upper_bound(ARGS &&...args)
Definition: gpu_enabled.h:123
DEVICE auto lower_bound(ARGS &&...args)
Definition: gpu_enabled.h:78

+ Here is the call graph for this function:

+ Here is the caller graph for this function: