OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp} Namespace Reference

Classes

struct  MaxRowGroupSizeStats
 

Functions

bool within_range (int64_t lower_bound, int64_t upper_bound, int64_t value)
 
bool is_valid_parquet_string (const parquet::ColumnDescriptor *parquet_column)
 
bool is_valid_parquet_list_column (const parquet::ColumnDescriptor *parquet_column)
 Detect a valid list parquet column. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder_with_omnisci_type (const ColumnDescriptor *column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename U , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
 Create a signed or unsigned integral parquet encoder using types. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder_with_omnisci_type (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const int bit_width, const bool is_signed)
 Create a integral parquet encoder using types. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
 
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool geo_validate_geometry)
 
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
 Create a Parquet specific encoder for a Parquet to OmniSci mapping. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import (std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const bool geo_validate_geometry)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const bool geo_validate_geometry)
 
void validate_list_column_metadata_statistics (const parquet::ParquetFileReader *reader, const int row_group_index, const int column_index, const int16_t *def_levels, const int64_t num_levels, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void set_definition_levels_for_zero_max_definition_level_case (const parquet::ColumnDescriptor *parquet_column_descriptor, std::vector< int16_t > &def_levels)
 
void validate_max_repetition_and_definition_level (const ColumnDescriptor *omnisci_column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void resize_values_buffer (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::vector< int8_t > &values)
 
bool validate_decimal_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_decimal_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_floating_point_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_floating_point_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_integral_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_integral_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool is_nanosecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_nanosecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_microsecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_microsecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_millisecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_millisecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool validate_none_type_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_boolean_type_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_timestamp_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_timestamp_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_time_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_time_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_date_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_date_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_string_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_string_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_geospatial_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
void validate_equal_schema (const parquet::arrow::FileReader *reference_file_reader, const parquet::arrow::FileReader *new_file_reader, const std::string &reference_file_path, const std::string &new_file_path)
 
void validate_allowed_mapping (const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)
 
SQLTypeInfo suggest_column_scalar_type (const parquet::ColumnDescriptor *parquet_column)
 
void validate_number_of_columns (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
void throw_missing_metadata_error (const int row_group_index, const int column_index, const std::string &file_path)
 
void throw_row_group_larger_than_fragment_size_error (const MaxRowGroupSizeStats max_row_group_stats, const int fragment_size)
 
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema, const bool do_metadata_stats_validation)
 
MaxRowGroupSizeStats validate_parquet_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema, const bool do_metadata_stats_validation)
 
std::list< RowGroupMetadata > metadata_scan_rowgroup_interval (const std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const RowGroupInterval &row_group_interval, const ReaderPtr &reader, const ForeignTableSchema &schema)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_import (const std::map< int, Chunk_NS::Chunk > chunks, const ForeignTableSchema &schema, const ReaderPtr &reader, const std::map< int, StringDictionary * > column_dictionaries, const int64_t num_rows, const bool geo_validate_geometry)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_metadata_scan (const Interval< ColumnType > &column_interval, const ForeignTableSchema &schema, const ReaderPtr &reader, const bool do_metadata_stats_validation, const bool geo_validate_geometry)
 

Function Documentation

std::shared_ptr< ParquetEncoder > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  is_for_detect,
const bool  geo_validate_geometry 
)

Definition at line 1025 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_encoder(), foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), SQLTypeInfo::is_fixlen_array(), and is_valid_parquet_list_column().

Referenced by create_parquet_encoder().

1034  {
1035  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column);
1036  if (!is_valid_parquet_list || !omnisci_column->columnType.is_array()) {
1037  return {};
1038  }
1039  std::unique_ptr<ColumnDescriptor> omnisci_column_sub_type_column =
1040  get_sub_type_column_descriptor(omnisci_column);
1041  auto encoder = create_parquet_encoder(omnisci_column_sub_type_column.get(),
1042  parquet_column,
1043  chunks,
1044  string_dictionary,
1045  chunk_metadata,
1046  is_metadata_scan,
1047  is_for_import,
1048  is_for_detect,
1049  geo_validate_geometry);
1050  CHECK(encoder.get());
1051  auto scalar_encoder = std::dynamic_pointer_cast<ParquetScalarEncoder>(encoder);
1052  CHECK(scalar_encoder);
1053  if (!is_for_import) {
1054  if (!is_for_detect) {
1055  if (omnisci_column->columnType.is_fixlen_array()) {
1056  encoder = std::make_shared<ParquetFixedLengthArrayEncoder>(
1057  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1058  scalar_encoder,
1059  omnisci_column);
1060  } else {
1061  encoder = std::make_shared<ParquetVariableLengthArrayEncoder>(
1062  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1063  is_metadata_scan ? nullptr : chunks.begin()->getIndexBuf(),
1064  scalar_encoder,
1065  omnisci_column);
1066  }
1067  } else { // is_for_detect
1068  encoder = std::make_shared<ParquetArrayDetectEncoder>(
1069  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1070  }
1071  } else { // is_for_import
1072  encoder = std::make_shared<ParquetArrayImportEncoder>(
1073  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1074  }
1075  return encoder;
1076 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.
bool is_fixlen_array() const
Definition: sqltypes.h:591
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType
bool is_array() const
Definition: sqltypes.h:585

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 737 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, kENCODING_DATE_IN_DAYS, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

741  {
742  auto column_type = omnisci_column->columnType;
743  if (parquet_column->logical_type()->is_date() && column_type.is_date()) {
744  if (column_type.get_compression() == kENCODING_DATE_IN_DAYS) {
745  if (is_metadata_scan_or_for_import) {
746  if (column_type.get_comp_param() ==
747  0) { // DATE ENCODING FIXED (32) uses comp param 0
748  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int32_t>>(
749  buffer);
750  } else if (column_type.get_comp_param() == 16) {
751  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int16_t>>(
752  buffer);
753  } else {
754  UNREACHABLE();
755  }
756  } else {
757  if (column_type.get_comp_param() ==
758  0) { // DATE ENCODING FIXED (32) uses comp param 0
759  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t>>(
760  buffer, omnisci_column, parquet_column);
761  } else if (column_type.get_comp_param() == 16) {
762  return std::make_shared<ParquetFixedLengthEncoder<int16_t, int32_t>>(
763  buffer, omnisci_column, parquet_column);
764  } else {
765  UNREACHABLE();
766  }
767  }
768  } else if (column_type.get_compression() == kENCODING_NONE) { // for array types
769  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int64_t>>(
770  buffer, omnisci_column, parquet_column);
771  } else {
772  UNREACHABLE();
773  }
774  }
775  return {};
776 }
#define UNREACHABLE()
Definition: Logger.h:338
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 694 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_date_from_timestamp_encoder_with_types(), kENCODING_DATE_IN_DAYS, and UNREACHABLE.

Referenced by create_parquet_encoder().

698  {
699  auto column_type = omnisci_column->columnType;
700  if (parquet_column->logical_type()->is_timestamp() && column_type.is_date()) {
701  CHECK(column_type.get_compression() == kENCODING_DATE_IN_DAYS);
702  if (is_metadata_scan_or_for_import) {
703  if (column_type.get_comp_param() ==
704  0) { // DATE ENCODING FIXED (32) uses comp param 0
705  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
706  int64_t,
707  int32_t>(
708  omnisci_column, parquet_column, buffer, true);
709  } else if (column_type.get_comp_param() == 16) {
710  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
711  int64_t,
712  int16_t>(
713  omnisci_column, parquet_column, buffer, true);
714  } else {
715  UNREACHABLE();
716  }
717  } else {
718  if (column_type.get_comp_param() ==
719  0) { // DATE ENCODING FIXED (32) uses comp param 0
720  return create_parquet_date_from_timestamp_encoder_with_types<int32_t,
721  int64_t,
722  int32_t>(
723  omnisci_column, parquet_column, buffer, false);
724  } else if (column_type.get_comp_param() == 16) {
725  return create_parquet_date_from_timestamp_encoder_with_types<int16_t,
726  int64_t,
727  int16_t>(
728  omnisci_column, parquet_column, buffer, false);
729  } else {
730  UNREACHABLE();
731  }
732  }
733  }
734  return {};
735 }
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 509 of file LazyParquetChunkLoader.cpp.

References heavydb.dtypes::T, and UNREACHABLE.

Referenced by create_parquet_date_from_timestamp_encoder().

513  {
514  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
515  parquet_column->logical_type().get())) {
516  switch (timestamp_logical_type->time_unit()) {
517  case parquet::LogicalType::TimeUnit::MILLIS:
518  if (is_metadata_scan_or_for_import) {
519  return std::make_shared<
520  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L, NullType>>(
521  buffer, omnisci_column, parquet_column);
522  }
523  return std::make_shared<
524  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L, NullType>>(
525  buffer, omnisci_column, parquet_column);
526  case parquet::LogicalType::TimeUnit::MICROS:
527  if (is_metadata_scan_or_for_import) {
528  return std::make_shared<
529  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
530  buffer, omnisci_column, parquet_column);
531  }
532  return std::make_shared<
533  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
534  buffer, omnisci_column, parquet_column);
535  case parquet::LogicalType::TimeUnit::NANOS:
536  if (is_metadata_scan_or_for_import) {
537  return std::make_shared<
538  ParquetDateInSecondsFromTimestampEncoder<V,
539  T,
540  1000L * 1000L * 1000L,
541  NullType>>(
542  buffer, omnisci_column, parquet_column);
543  }
544  return std::make_shared<
545  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
546  buffer, omnisci_column, parquet_column);
547  default:
548  UNREACHABLE();
549  }
550  } else {
551  UNREACHABLE();
552  }
553  return {};
554 }
ParquetTimestampEncoder< V, T, conversion_denominator, NullType > ParquetDateInSecondsFromTimestampEncoder
#define UNREACHABLE()
Definition: Logger.h:338

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 172 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

176  {
177  if (parquet_column->logical_type()->is_decimal()) {
178  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
179  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int64_t>(
180  omnisci_column, parquet_column, buffer);
181  }
182  CHECK(omnisci_column->columnType.get_compression() == kENCODING_FIXED);
183  if (is_metadata_scan_or_for_import) {
184  switch (omnisci_column->columnType.get_comp_param()) {
185  case 16:
186  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int16_t>(
187  omnisci_column, parquet_column, buffer);
188  case 32:
189  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int32_t>(
190  omnisci_column, parquet_column, buffer);
191  default:
192  UNREACHABLE();
193  }
194  } else {
195  switch (omnisci_column->columnType.get_comp_param()) {
196  case 16:
197  return create_parquet_decimal_encoder_with_omnisci_type<int16_t, int16_t>(
198  omnisci_column, parquet_column, buffer);
199  case 32:
200  return create_parquet_decimal_encoder_with_omnisci_type<int32_t, int32_t>(
201  omnisci_column, parquet_column, buffer);
202  default:
203  UNREACHABLE();
204  }
205  }
206  }
207  return {};
208 }
#define UNREACHABLE()
Definition: Logger.h:338
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder_with_omnisci_type ( const ColumnDescriptor *  column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor,
AbstractBuffer *  buffer 
)

Definition at line 148 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

151  {
152  switch (parquet_column_descriptor->physical_type()) {
153  case parquet::Type::INT32:
154  return std::make_shared<ParquetDecimalEncoder<V, int32_t, NullType>>(
155  buffer, column_descriptor, parquet_column_descriptor);
156  case parquet::Type::INT64:
157  return std::make_shared<ParquetDecimalEncoder<V, int64_t, NullType>>(
158  buffer, column_descriptor, parquet_column_descriptor);
159  case parquet::Type::FIXED_LEN_BYTE_ARRAY:
160  return std::make_shared<
161  ParquetDecimalEncoder<V, parquet::FixedLenByteArray, NullType>>(
162  buffer, column_descriptor, parquet_column_descriptor);
163  case parquet::Type::BYTE_ARRAY:
164  return std::make_shared<ParquetDecimalEncoder<V, parquet::ByteArray, NullType>>(
165  buffer, column_descriptor, parquet_column_descriptor);
166  default:
167  UNREACHABLE();
168  }
169  return {};
170 }
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  is_for_detect,
const bool  geo_validate_geometry 
)

Create a Parquet specific encoder for a Parquet to OmniSci mapping.

Parameters
omnisci_column- the descriptor of OmniSci column
parquet_column- the descriptor of Parquet column
chunks- list of chunks to populate (the case of more than one chunk happens only if a logical column expands to multiple physical columns)
string_dictionary- string dictionary used in encoding for string dictionary encoded columns
chunk_metadata- similar to the list of chunks, a list of chunk metadata that is populated
is_metadata_scan- a flag indicating if the encoders created should be for a metadata scan
is_for_import- a flag indicating if the encoders created should be for import
Returns
An appropriate Parquet encoder for the use case defined by the Parquet to OmniSci mapping.

Notes:

  • In the case of a metadata scan, the type of the encoder created may significantly change (for example in bit width.) This is because it is common for OmniSci to store metadata in a different format altogether than the data itself (see for example FixedLengthEncoder.)
  • This function and the function isColumnMappingSupported work in conjunction with each other. For example, once a mapping is known to be allowed (since isColumnMappingSupported returned true) this function does not have to check many corner cases exhaustively as it would be redundant with what was checked in isColumnMappingSupported.

Definition at line 905 of file LazyParquetChunkLoader.cpp.

References CHECK, create_parquet_array_encoder(), create_parquet_date_encoder(), create_parquet_date_from_timestamp_encoder(), create_parquet_decimal_encoder(), create_parquet_floating_point_encoder(), create_parquet_geospatial_encoder(), create_parquet_integral_encoder(), create_parquet_none_type_encoder(), create_parquet_string_encoder(), create_parquet_time_encoder(), create_parquet_timestamp_encoder(), and UNREACHABLE.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), create_parquet_array_encoder(), create_parquet_encoder_for_import(), and create_parquet_encoder_for_metadata_scan().

914  {
915  CHECK(!(is_metadata_scan && is_for_import));
916  auto buffer = chunks.empty() ? nullptr : chunks.begin()->getBuffer();
917  if (auto encoder = create_parquet_geospatial_encoder(omnisci_column,
918  parquet_column,
919  chunks,
920  chunk_metadata,
921  is_metadata_scan,
922  is_for_import,
923  geo_validate_geometry)) {
924  return encoder;
925  }
926  if (auto encoder = create_parquet_array_encoder(omnisci_column,
927  parquet_column,
928  chunks,
929  string_dictionary,
930  chunk_metadata,
931  is_metadata_scan,
932  is_for_import,
933  is_for_detect,
934  geo_validate_geometry)) {
935  return encoder;
936  }
937  if (auto encoder = create_parquet_decimal_encoder(
938  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
939  return encoder;
940  }
941  if (auto encoder = create_parquet_integral_encoder(
942  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
943  return encoder;
944  }
945  if (auto encoder =
946  create_parquet_floating_point_encoder(omnisci_column, parquet_column, buffer)) {
947  return encoder;
948  }
949  if (auto encoder = create_parquet_timestamp_encoder(
950  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
951  return encoder;
952  }
953  if (auto encoder =
954  create_parquet_none_type_encoder(omnisci_column, parquet_column, buffer)) {
955  return encoder;
956  }
957  if (auto encoder = create_parquet_time_encoder(
958  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
959  return encoder;
960  }
961  if (auto encoder = create_parquet_date_from_timestamp_encoder(
962  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
963  return encoder;
964  }
965  if (auto encoder = create_parquet_date_encoder(
966  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
967  return encoder;
968  }
969  if (auto encoder = create_parquet_string_encoder(
970  omnisci_column,
971  parquet_column,
972  chunks.empty() ? Chunk_NS::Chunk{} : *chunks.begin(),
973  string_dictionary,
974  chunk_metadata,
975  is_for_import,
976  is_for_detect)) {
977  return encoder;
978  }
979  UNREACHABLE();
980  return {};
981 }
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool geo_validate_geometry)
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_import ( std::list< Chunk_NS::Chunk > &  chunks,
const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
StringDictionary string_dictionary,
const bool  geo_validate_geometry 
)

Intended to be used for the import case.

Definition at line 986 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_import().

991  {
992  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
993  return create_parquet_encoder(omnisci_column,
994  parquet_column,
995  chunks,
996  string_dictionary,
997  chunk_metadata,
998  false,
999  true,
1000  false,
1001  geo_validate_geometry);
1002 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_metadata_scan ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const bool  geo_validate_geometry 
)

Intended to be used only with a metadata scan. Creates an incomplete encoder capable of updating metadata.

Definition at line 1008 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_metadata_scan().

1011  {
1012  std::list<Chunk_NS::Chunk> chunks;
1013  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
1014  return create_parquet_encoder(omnisci_column,
1015  parquet_column,
1016  chunks,
1017  nullptr,
1018  chunk_metadata,
1019  true,
1020  false,
1021  false,
1022  geo_validate_geometry);
1023 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect, const bool geo_validate_geometry)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_floating_point_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 428 of file LazyParquetChunkLoader.cpp.

References CHECK, CHECK_EQ, ColumnDescriptor::columnType, kDOUBLE, kENCODING_NONE, kFLOAT, and UNREACHABLE.

Referenced by create_parquet_encoder().

431  {
432  auto column_type = omnisci_column->columnType;
433  if (!column_type.is_fp()) {
434  return {};
435  }
436  CHECK_EQ(column_type.get_compression(), kENCODING_NONE);
437  switch (column_type.get_type()) {
438  case kFLOAT:
439  switch (parquet_column->physical_type()) {
440  case parquet::Type::FLOAT:
441  return std::make_shared<ParquetFixedLengthEncoder<float, float>>(
442  buffer, omnisci_column, parquet_column);
443  case parquet::Type::DOUBLE:
444  return std::make_shared<ParquetFixedLengthEncoder<float, double>>(
445  buffer, omnisci_column, parquet_column);
446  default:
447  UNREACHABLE();
448  }
449  case kDOUBLE:
450  CHECK(parquet_column->physical_type() == parquet::Type::DOUBLE);
451  return std::make_shared<ParquetFixedLengthEncoder<double, double>>(
452  buffer, omnisci_column, parquet_column);
453  default:
454  UNREACHABLE();
455  }
456  return {};
457 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_geospatial_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  geo_validate_geometry 
)

Definition at line 831 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and is_valid_parquet_string().

Referenced by create_parquet_encoder().

838  {
839  auto column_type = omnisci_column->columnType;
840  if (!is_valid_parquet_string(parquet_column) || !column_type.is_geometry()) {
841  return {};
842  }
843  if (is_for_import) {
844  return std::make_shared<ParquetGeospatialImportEncoder>(chunks,
845  geo_validate_geometry);
846  }
847  if (is_metadata_scan) {
848  return std::make_shared<ParquetGeospatialEncoder>(geo_validate_geometry);
849  }
850  for (auto chunks_iter = chunks.begin(); chunks_iter != chunks.end(); ++chunks_iter) {
851  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
852  auto& chunk_metadata_ptr = chunk_metadata.back();
853  chunk_metadata_ptr->sqlType = chunks_iter->getColumnDesc()->columnType;
854  }
855  return std::make_shared<ParquetGeospatialEncoder>(
856  parquet_column, chunks, chunk_metadata, geo_validate_geometry);
857 }
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 298 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, and UNREACHABLE.

Referenced by create_parquet_encoder().

302  {
303  auto column_type = omnisci_column->columnType;
304  auto physical_type = parquet_column->physical_type();
305 
306  int bit_width = -1;
307  int is_signed = false;
308  // handle the integral case with no Parquet annotation
309  if (parquet_column->logical_type()->is_none() && column_type.is_integer()) {
310  if (physical_type == parquet::Type::INT32) {
311  bit_width = 32;
312  } else if (physical_type == parquet::Type::INT64) {
313  bit_width = 64;
314  } else {
315  UNREACHABLE();
316  }
317  is_signed = true;
318  }
319  // handle the integral case with Parquet annotation
320  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
321  parquet_column->logical_type().get())) {
322  bit_width = int_logical_column->bit_width();
323  is_signed = int_logical_column->is_signed();
324  }
325 
326  if (bit_width == -1) { // no valid logical type (with or without annotation) found
327  return {};
328  }
329 
330  const size_t omnisci_data_type_byte_size = column_type.get_size();
331  const size_t parquet_data_type_byte_size = parquet::GetTypeByteSize(physical_type);
332 
333  switch (omnisci_data_type_byte_size) {
334  case 8:
335  CHECK(column_type.get_compression() == kENCODING_NONE);
336  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int64_t>(
337  buffer,
338  omnisci_data_type_byte_size,
339  parquet_data_type_byte_size,
340  bit_width,
341  is_signed);
342  case 4:
343  if (is_metadata_scan_or_for_import && column_type.get_type() == kBIGINT) {
344  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int32_t>(
345  buffer,
346  omnisci_data_type_byte_size,
347  parquet_data_type_byte_size,
348  bit_width,
349  is_signed);
350  }
351  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int32_t>(
352  buffer,
353  omnisci_data_type_byte_size,
354  parquet_data_type_byte_size,
355  bit_width,
356  is_signed);
357  case 2:
358  if (is_metadata_scan_or_for_import) {
359  switch (column_type.get_type()) {
360  case kBIGINT:
361  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int16_t>(
362  buffer,
363  omnisci_data_type_byte_size,
364  parquet_data_type_byte_size,
365  bit_width,
366  is_signed);
367  case kINT:
368  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int16_t>(
369  buffer,
370  omnisci_data_type_byte_size,
371  parquet_data_type_byte_size,
372  bit_width,
373  is_signed);
374  case kSMALLINT:
375  break;
376  default:
377  UNREACHABLE();
378  }
379  }
380  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int16_t>(
381  buffer,
382  omnisci_data_type_byte_size,
383  parquet_data_type_byte_size,
384  bit_width,
385  is_signed);
386  case 1:
387  if (is_metadata_scan_or_for_import) {
388  switch (column_type.get_type()) {
389  case kBIGINT:
390  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int8_t>(
391  buffer,
392  omnisci_data_type_byte_size,
393  parquet_data_type_byte_size,
394  bit_width,
395  is_signed);
396  case kINT:
397  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int8_t>(
398  buffer,
399  omnisci_data_type_byte_size,
400  parquet_data_type_byte_size,
401  bit_width,
402  is_signed);
403  case kSMALLINT:
404  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int8_t>(
405  buffer,
406  omnisci_data_type_byte_size,
407  parquet_data_type_byte_size,
408  bit_width,
409  is_signed);
410  case kTINYINT:
411  break;
412  default:
413  UNREACHABLE();
414  }
415  }
416  return create_parquet_integral_encoder_with_omnisci_type<int8_t, int8_t>(
417  buffer,
418  omnisci_data_type_byte_size,
419  parquet_data_type_byte_size,
420  bit_width,
421  is_signed);
422  default:
423  UNREACHABLE();
424  }
425  return {};
426 }
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqltypes.h:72
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder_with_omnisci_type ( AbstractBuffer buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const int  bit_width,
const bool  is_signed 
)

Create an integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
bit_width- bit width specified for the Parquet column
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated types V and NullType.

Note: this function determines the appropriate bit-depth integral encoder to create, while create_parquet_signed_or_unsigned_integral_encoder_with_types determines whether to create a signed or unsigned integral encoder.

Definition at line 261 of file LazyParquetChunkLoader.cpp.

References create_parquet_signed_or_unsigned_integral_encoder_with_types(), and UNREACHABLE.

266  {
267  switch (bit_width) {
268  case 8:
270  int32_t,
271  uint8_t,
272  NullType>(
273  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
274  case 16:
276  int32_t,
277  uint16_t,
278  NullType>(
279  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
280  case 32:
282  int32_t,
283  uint32_t,
284  NullType>(
285  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
286  case 64:
288  int64_t,
289  uint64_t,
290  NullType>(
291  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
292  default:
293  UNREACHABLE();
294  }
295  return {};
296 }
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types(AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
Create a signed or unsigned integral parquet encoder using types.
#define UNREACHABLE()
Definition: Logger.h:338

+ Here is the call graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_none_type_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 459 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_string(), kBOOLEAN, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

462  {
463  auto column_type = omnisci_column->columnType;
464  if (parquet_column->logical_type()->is_none() &&
465  !omnisci_column->columnType.is_string()) { // boolean
466  if (column_type.get_compression() == kENCODING_NONE) {
467  switch (column_type.get_type()) {
468  case kBOOLEAN:
469  return std::make_shared<ParquetFixedLengthEncoder<int8_t, bool>>(
470  buffer, omnisci_column, parquet_column);
471  default:
472  UNREACHABLE();
473  }
474  } else {
475  UNREACHABLE();
476  }
477  }
478  return {};
479 }
#define UNREACHABLE()
Definition: Logger.h:338
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:561

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename U , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_signed_or_unsigned_integral_encoder_with_types ( AbstractBuffer buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const bool  is_signed 
)

Create a signed or unsigned integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated types V, T, U, and NullType.

Definition at line 226 of file LazyParquetChunkLoader.cpp.

References CHECK.

Referenced by create_parquet_integral_encoder_with_omnisci_type().

230  {
231  CHECK(sizeof(NullType) == omnisci_data_type_byte_size);
232  if (is_signed) {
233  return std::make_shared<ParquetFixedLengthEncoder<V, T, NullType>>(
234  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
235  } else {
236  return std::make_shared<ParquetUnsignedFixedLengthEncoder<V, T, U, NullType>>(
237  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
238  }
239 }
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_string_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const Chunk_NS::Chunk chunk,
StringDictionary string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
bool  is_for_import,
const bool  is_for_detect 
)

Definition at line 778 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, Chunk_NS::Chunk::getBuffer(), Chunk_NS::Chunk::getIndexBuf(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

785  {
786  auto column_type = omnisci_column->columnType;
787  if (!is_valid_parquet_string(parquet_column) ||
788  !omnisci_column->columnType.is_string()) {
789  return {};
790  }
791  if (column_type.get_compression() == kENCODING_NONE) {
792  if (is_for_import) {
793  return std::make_shared<ParquetStringImportEncoder>(chunk.getBuffer());
794  } else {
795  return std::make_shared<ParquetStringNoneEncoder>(chunk.getBuffer(),
796  chunk.getIndexBuf());
797  }
798  } else if (column_type.get_compression() == kENCODING_DICT) {
799  if (!is_for_detect) { // non-detect use case
800  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
801  std::unique_ptr<ChunkMetadata>& logical_chunk_metadata = chunk_metadata.back();
802  logical_chunk_metadata->sqlType = omnisci_column->columnType;
803  switch (column_type.get_size()) {
804  case 1:
805  return std::make_shared<ParquetStringEncoder<uint8_t>>(
806  chunk.getBuffer(),
807  string_dictionary,
808  is_for_import ? nullptr : logical_chunk_metadata.get());
809  case 2:
810  return std::make_shared<ParquetStringEncoder<uint16_t>>(
811  chunk.getBuffer(),
812  string_dictionary,
813  is_for_import ? nullptr : logical_chunk_metadata.get());
814  case 4:
815  return std::make_shared<ParquetStringEncoder<int32_t>>(
816  chunk.getBuffer(),
817  string_dictionary,
818  is_for_import ? nullptr : logical_chunk_metadata.get());
819  default:
820  UNREACHABLE();
821  }
822  } else { // detect use-case
823  return std::make_shared<ParquetDetectStringEncoder>(chunk.getBuffer());
824  }
825  } else {
826  UNREACHABLE();
827  }
828  return {};
829 }
AbstractBuffer * getIndexBuf() const
Definition: Chunk.h:148
#define UNREACHABLE()
Definition: Logger.h:338
AbstractBuffer * getBuffer() const
Definition: Chunk.h:146
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:561

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 641 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

645  {
646  auto column_type = omnisci_column->columnType;
647  if (auto time_logical_column = dynamic_cast<const parquet::TimeLogicalType*>(
648  parquet_column->logical_type().get())) {
649  if (column_type.get_compression() == kENCODING_NONE) {
650  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
651  return create_parquet_time_encoder_with_types<int64_t, int32_t, int64_t>(
652  omnisci_column, parquet_column, buffer);
653  } else {
654  return create_parquet_time_encoder_with_types<int64_t, int64_t, int64_t>(
655  omnisci_column, parquet_column, buffer);
656  }
657  } else if (column_type.get_compression() == kENCODING_FIXED) {
658  if (is_metadata_scan_or_for_import) {
659  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
660  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
661  return create_parquet_time_encoder_with_types<int64_t, int32_t, int32_t>(
662  omnisci_column, parquet_column, buffer);
663  } else {
664  CHECK(time_logical_column->time_unit() ==
665  parquet::LogicalType::TimeUnit::MICROS ||
666  time_logical_column->time_unit() ==
667  parquet::LogicalType::TimeUnit::NANOS);
668  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
669  return create_parquet_time_encoder_with_types<int64_t, int64_t, int32_t>(
670  omnisci_column, parquet_column, buffer);
671  }
672  } else {
673  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
674  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
675  return create_parquet_time_encoder_with_types<int32_t, int32_t, int32_t>(
676  omnisci_column, parquet_column, buffer);
677  } else {
678  CHECK(time_logical_column->time_unit() ==
679  parquet::LogicalType::TimeUnit::MICROS ||
680  time_logical_column->time_unit() ==
681  parquet::LogicalType::TimeUnit::NANOS);
682  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
683  return create_parquet_time_encoder_with_types<int32_t, int64_t, int32_t>(
684  omnisci_column, parquet_column, buffer);
685  }
686  }
687  } else {
688  UNREACHABLE();
689  }
690  }
691  return {};
692 }
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder_with_types ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 615 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

618  {
619  if (auto time_logical_type = dynamic_cast<const parquet::TimeLogicalType*>(
620  parquet_column->logical_type().get())) {
621  switch (time_logical_type->time_unit()) {
622  case parquet::LogicalType::TimeUnit::MILLIS:
623  return std::make_shared<ParquetTimeEncoder<V, T, 1000L, NullType>>(
624  buffer, omnisci_column, parquet_column);
625  case parquet::LogicalType::TimeUnit::MICROS:
626  return std::make_shared<ParquetTimeEncoder<V, T, 1000L * 1000L, NullType>>(
627  buffer, omnisci_column, parquet_column);
628  case parquet::LogicalType::TimeUnit::NANOS:
629  return std::make_shared<
630  ParquetTimeEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
631  buffer, omnisci_column, parquet_column);
632  default:
633  UNREACHABLE();
634  }
635  } else {
636  UNREACHABLE();
637  }
638  return {};
639 }
#define UNREACHABLE()
Definition: Logger.h:338
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 556 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_precision(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

560  {
561  auto column_type = omnisci_column->columnType;
562  auto precision = column_type.get_precision();
563  if (parquet_column->logical_type()->is_timestamp()) {
564  if (column_type.get_compression() == kENCODING_NONE) {
565  if (precision == 0) {
566  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int64_t>(
567  omnisci_column, parquet_column, buffer);
568  } else {
569  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
570  buffer, omnisci_column, parquet_column);
571  }
572  } else if (column_type.get_compression() == kENCODING_FIXED) {
573  CHECK(column_type.get_comp_param() == 32);
574  if (is_metadata_scan_or_for_import) {
575  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int32_t>(
576  omnisci_column, parquet_column, buffer);
577  } else {
578  return create_parquet_timestamp_encoder_with_types<int32_t, int64_t, int32_t>(
579  omnisci_column, parquet_column, buffer);
580  }
581  }
582  } else if (parquet_column->logical_type()->is_none() && column_type.is_timestamp()) {
583  if (parquet_column->physical_type() == parquet::Type::INT32) {
584  CHECK(column_type.get_compression() == kENCODING_FIXED &&
585  column_type.get_comp_param() == 32);
586  if (is_metadata_scan_or_for_import) {
587  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int32_t, int32_t>>(
588  buffer, omnisci_column, parquet_column);
589  } else {
590  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t, int32_t>>(
591  buffer, omnisci_column, parquet_column);
592  }
593  } else if (parquet_column->physical_type() == parquet::Type::INT64) {
594  if (column_type.get_compression() == kENCODING_NONE) {
595  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
596  buffer, omnisci_column, parquet_column);
597  } else if (column_type.get_compression() == kENCODING_FIXED) {
598  CHECK(column_type.get_comp_param() == 32);
599  if (is_metadata_scan_or_for_import) {
600  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int32_t>>(
601  buffer, omnisci_column, parquet_column);
602  } else {
603  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int64_t, int32_t>>(
604  buffer, omnisci_column, parquet_column);
605  }
606  }
607  } else {
608  UNREACHABLE();
609  }
610  }
611  return {};
612 }
#define UNREACHABLE()
Definition: Logger.h:338
int get_precision() const
Definition: sqltypes.h:394
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder_with_types ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 482 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

485  {
486  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
487  parquet_column->logical_type().get())) {
488  switch (timestamp_logical_type->time_unit()) {
489  case parquet::LogicalType::TimeUnit::MILLIS:
490  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L, NullType>>(
491  buffer, omnisci_column, parquet_column);
492  case parquet::LogicalType::TimeUnit::MICROS:
493  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
494  buffer, omnisci_column, parquet_column);
495  case parquet::LogicalType::TimeUnit::NANOS:
496  return std::make_shared<
497  ParquetTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
498  buffer, omnisci_column, parquet_column);
499  default:
500  UNREACHABLE();
501  }
502  } else {
503  UNREACHABLE();
504  }
505  return {};
506 }
#define UNREACHABLE()
Definition: Logger.h:338
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1333 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1333  {
1334  return omnisci_column->columnType.get_dimension() == 6;
1335 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1337 of file LazyParquetChunkLoader.cpp.

1338  {
1339  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MICROS;
1340 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1342 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1342  {
1343  return omnisci_column->columnType.get_dimension() == 3;
1344 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1346 of file LazyParquetChunkLoader.cpp.

1347  {
1348  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS;
1349 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1324 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1324  {
1325  return omnisci_column->columnType.get_dimension() == 9;
1326 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1328 of file LazyParquetChunkLoader.cpp.

1329  {
1330  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::NANOS;
1331 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_list_column ( const parquet::ColumnDescriptor *  parquet_column)

Detect a valid list parquet column.

Parameters
parquet_column- the parquet column descriptor of the column to detect
Returns
true if it is a valid parquet list column

Note: the notion of a valid parquet list column is adapted from the parquet schema specification for logical type definitions:

<list-repetition> group <name> (LIST) { repeated group list { <element-repetition> <element-type> element; } }

Testing has shown that there are small deviations from this specification in at least one library — pyarrow — where the innermost schema node is named "item" as opposed to "element".

The following is also true of the schema definition.

  • The outer-most level must be a group annotated with LIST that contains a single field named list. The repetition of this level must be either optional or required and determines whether the list is nullable.
  • The middle level, named list, must be a repeated group with a single field named element.
  • The element field encodes the list's element type and repetition. Element repetition must be required or optional.

FSI further restricts lists to be defined only at the top level, meaning directly below the root schema node.

Definition at line 102 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_array_encoder(), set_definition_levels_for_zero_max_definition_level_case(), foreign_storage::LazyParquetChunkLoader::suggestColumnMapping(), validate_allowed_mapping(), validate_column_mapping_and_row_group_metadata(), validate_list_column_metadata_statistics(), and validate_max_repetition_and_definition_level().

102  {
103  const parquet::schema::Node* node = parquet_column->schema_node().get();
104  if ((node->name() != "element" && node->name() != "item") ||
105  !(node->is_required() ||
106  node->is_optional())) { // ensure first innermost node is named "element"
107  // which is required by the parquet specification;
108  // however testing shows that pyarrow generates this
109  // column with the name of "item"
110  // this field must be either required or optional
111  return false;
112  }
113  node = node->parent();
114  if (!node) { // required nested structure
115  return false;
116  }
117  if (node->name() != "list" || !node->is_repeated() ||
118  !node->is_group()) { // ensure second innermost node is named "list" which is
119  // a repeated group; this is
120  // required by the parquet specification
121  return false;
122  }
123  node = node->parent();
124  if (!node) { // required nested structure
125  return false;
126  }
127  if (!node->logical_type()->is_list() ||
128  !(node->is_optional() ||
129  node->is_required())) { // ensure third outermost node has logical type LIST
130  // which is either optional or required; this is required
131  // by the parquet specification
132  return false;
133  }
134  node =
135  node->parent(); // this must now be the root node of schema which is required by
136  // FSI (lists can not be embedded into a deeper nested structure)
137  if (!node) { // required nested structure
138  return false;
139  }
140  node = node->parent();
141  if (node) { // implies the previous node was not the root node
142  return false;
143  }
144  return true;
145 }

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_string ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 60 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_geospatial_encoder(), create_parquet_string_encoder(), suggest_column_scalar_type(), suggest_string_mapping(), validate_geospatial_mapping(), and validate_string_mapping().

60  {
61  return (parquet_column->logical_type()->is_none() &&
62  parquet_column->physical_type() == parquet::Type::BYTE_ARRAY) ||
63  parquet_column->logical_type()->is_string();
64 }

+ Here is the caller graph for this function:

std::list<RowGroupMetadata> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval ( const std::map< int, std::shared_ptr< ParquetEncoder >> &  encoder_map,
const RowGroupInterval &  row_group_interval,
const ReaderPtr &  reader,
const ForeignTableSchema &  schema 
)

Definition at line 1719 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnId, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::ForeignTableSchema::getLogicalColumn(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::RowGroupInterval::start_index.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1723  {
1724  std::list<RowGroupMetadata> row_group_metadata;
1725  auto column_interval =
1726  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
1727  schema.getLogicalAndPhysicalColumns().back()->columnId};
1728 
1729  auto file_metadata = reader->parquet_reader()->metadata();
1730  for (int row_group = row_group_interval.start_index;
1731  row_group <= row_group_interval.end_index;
1732  ++row_group) {
1733  auto& row_group_metadata_item = row_group_metadata.emplace_back();
1734  row_group_metadata_item.row_group_index = row_group;
1735  row_group_metadata_item.file_path = row_group_interval.file_path;
1736 
1737  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1738  file_metadata->RowGroup(row_group);
1739 
1740  for (int column_id = column_interval.start; column_id <= column_interval.end;
1741  column_id++) {
1742  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1743  auto parquet_column_index = schema.getParquetColumnIndex(column_id);
1744  auto encoder_map_iter =
1745  encoder_map.find(schema.getLogicalColumn(column_id)->columnId);
1746  CHECK(encoder_map_iter != encoder_map.end());
1747  try {
1748  auto metadata = encoder_map_iter->second->getRowGroupMetadata(
1749  group_metadata.get(), parquet_column_index, column_descriptor->columnType);
1750  row_group_metadata_item.column_chunk_metadata.emplace_back(metadata);
1751  } catch (const std::exception& e) {
1752  std::stringstream error_message;
1753  error_message << e.what() << " in row group " << row_group << " of Parquet file '"
1754  << row_group_interval.file_path << "'.";
1755  throw std::runtime_error(error_message.str());
1756  }
1757  }
1758  }
1759  return row_group_metadata;
1760 }
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_import ( const std::map< int, Chunk_NS::Chunk chunks,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const std::map< int, StringDictionary * >  column_dictionaries,
const int64_t  num_rows,
const bool  geo_validate_geometry 
)

Definition at line 1762 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_import(), shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), and foreign_storage::ForeignTableSchema::getParquetColumnIndex().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1768  {
1769  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1770  auto file_metadata = reader->parquet_reader()->metadata();
1771  for (auto& [column_id, chunk] : chunks) {
1772  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1773  if (column_descriptor->isGeoPhyCol) { // skip physical columns
1774  continue;
1775  }
1776  auto parquet_column_descriptor =
1777  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1778  auto find_it = column_dictionaries.find(column_id);
1779  StringDictionary* dictionary =
1780  (find_it == column_dictionaries.end() ? nullptr : find_it->second);
1781  std::list<Chunk_NS::Chunk> chunks_for_import;
1782  chunks_for_import.push_back(chunk);
1783  if (column_descriptor->columnType.is_geometry()) {
1784  for (int i = 0; i < column_descriptor->columnType.get_physical_cols(); ++i) {
1785  chunks_for_import.push_back(chunks.at(column_id + i + 1));
1786  }
1787  }
1788  encoder_map[column_id] = create_parquet_encoder_for_import(chunks_for_import,
1789  column_descriptor,
1790  parquet_column_descriptor,
1791  dictionary,
1792  geo_validate_geometry);
1793 
1794  // reserve space in buffer when num-elements known ahead of time for types
1795  // of known size (for example dictionary encoded strings)
1796  auto encoder = shared::get_from_map(encoder_map, column_id);
1797  if (auto inplace_encoder = dynamic_cast<ParquetInPlaceEncoder*>(encoder.get())) {
1798  inplace_encoder->reserve(num_rows);
1799  }
1800  }
1801  return encoder_map;
1802 }
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:62
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import(std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const bool geo_validate_geometry)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_metadata_scan ( const Interval< ColumnType > &  column_interval,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const bool  do_metadata_stats_validation,
const bool  geo_validate_geometry 
)

Definition at line 1804 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_metadata_scan(), foreign_storage::Interval< T >::end, shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::Interval< T >::start.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1809  {
1810  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1811  auto file_metadata = reader->parquet_reader()->metadata();
1812  for (int column_id = column_interval.start; column_id <= column_interval.end;
1813  column_id++) {
1814  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1815  auto parquet_column_descriptor =
1816  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1817  encoder_map[column_id] = create_parquet_encoder_for_metadata_scan(
1818  column_descriptor, parquet_column_descriptor, geo_validate_geometry);
1819  if (!do_metadata_stats_validation) {
1820  shared::get_from_map(encoder_map, column_id)->disableMetadataStatsValidation();
1821  }
1822  column_id += column_descriptor->columnType.get_physical_cols();
1823  }
1824  return encoder_map;
1825 }
T const end
Definition: Intervals.h:68
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:62
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const bool geo_validate_geometry)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::vector< int8_t > &  values 
)

Definition at line 1171 of file LazyParquetChunkLoader.cpp.

References foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements, ColumnDescriptor::columnType, and SQLTypeInfo::get_size().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1173  {
1174  auto max_type_byte_size =
1175  std::max(omnisci_column->columnType.get_size(),
1176  parquet::GetTypeByteSize(parquet_column->physical_type()));
1177  size_t values_size =
1178  LazyParquetChunkLoader::batch_reader_num_elements * max_type_byte_size;
1179  values.resize(values_size);
1180 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::set_definition_levels_for_zero_max_definition_level_case ( const parquet::ColumnDescriptor *  parquet_column_descriptor,
std::vector< int16_t > &  def_levels 
)

This function sets the definition levels to 1 for all read values in the case of required scalar/flat columns. The definition level of one informs all subsequent calls to parquet encoders to treat the read data as not null.

Definition at line 1114 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_list_column().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups().

1116  {
1117  if (!is_valid_parquet_list_column(parquet_column_descriptor) &&
1118  parquet_column_descriptor->max_definition_level() == 0) {
1119  if (!parquet_column_descriptor->schema_node()->is_required()) {
1120  throw std::runtime_error(
1121  "Unsupported parquet column detected. Column '" +
1122  parquet_column_descriptor->path()->ToDotString() +
1123  "' detected to have max definition level of 0 but is optional.");
1124  }
1125  def_levels.assign(def_levels.size(), 1);
1126  }
1127 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_boolean_type_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1360 of file LazyParquetChunkLoader.cpp.

References kBOOLEAN, kENCODING_NONE, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1361  {
1362  SQLTypeInfo type;
1364  type.set_type(kBOOLEAN);
1365  type.set_fixed_size();
1366  return type;
1367 }
void set_compression(EncodingType c)
Definition: sqltypes.h:481
void set_fixed_size()
Definition: sqltypes.h:479
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_column_scalar_type ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1568 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_string(), suggest_boolean_type_mapping(), suggest_date_mapping(), suggest_decimal_mapping(), suggest_floating_point_mapping(), suggest_integral_mapping(), suggest_string_mapping(), suggest_time_mapping(), and suggest_timestamp_mapping().

Referenced by foreign_storage::LazyParquetChunkLoader::suggestColumnMapping().

1568  {
1569  // decimal case
1570  if (parquet_column->logical_type()->is_decimal()) {
1571  return suggest_decimal_mapping(parquet_column);
1572  }
1573  // float case
1574  if (parquet_column->logical_type()->is_none() &&
1575  (parquet_column->physical_type() == parquet::Type::FLOAT ||
1576  parquet_column->physical_type() == parquet::Type::DOUBLE)) {
1577  return suggest_floating_point_mapping(parquet_column);
1578  }
1579  // integral case
1580  if ((parquet_column->logical_type()->is_none() &&
1581  (parquet_column->physical_type() == parquet::Type::INT32 ||
1582  parquet_column->physical_type() == parquet::Type::INT64)) ||
1583  parquet_column->logical_type()->is_int()) {
1584  return suggest_integral_mapping(parquet_column);
1585  }
1586  // boolean case
1587  if (parquet_column->logical_type()->is_none() &&
1588  parquet_column->physical_type() == parquet::Type::BOOLEAN) {
1589  return suggest_boolean_type_mapping(parquet_column);
1590  }
1591  // timestamp case
1592  if (parquet_column->logical_type()->is_timestamp()) {
1593  return suggest_timestamp_mapping(parquet_column);
1594  }
1595  // time case
1596  if (parquet_column->logical_type()->is_time()) {
1597  return suggest_time_mapping(parquet_column);
1598  }
1599  // date case
1600  if (parquet_column->logical_type()->is_date()) {
1601  return suggest_date_mapping(parquet_column);
1602  }
1603  // string case
1604  if (is_valid_parquet_string(parquet_column)) {
1605  return suggest_string_mapping(parquet_column);
1606  }
1607 
1608  throw ForeignStorageException("Unsupported data type detected for column: " +
1609  parquet_column->ToString());
1610 }
SQLTypeInfo suggest_decimal_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_timestamp_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_string_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_date_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_floating_point_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_integral_mapping(const parquet::ColumnDescriptor *parquet_column)
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_boolean_type_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_time_mapping(const parquet::ColumnDescriptor *parquet_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_date_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1464 of file LazyParquetChunkLoader.cpp.

References CHECK, kDATE, kENCODING_NONE, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1464  {
1465  CHECK(parquet_column->logical_type()->is_date());
1466  SQLTypeInfo type;
1467  type.set_type(kDATE);
1468  type.set_compression(kENCODING_NONE);
1469  type.set_fixed_size();
1470  return type;
1471 }
Definition: sqltypes.h:80
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_decimal_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1196 of file LazyParquetChunkLoader.cpp.

References kDECIMAL, kENCODING_NONE, sql_constants::kMaxNumericPrecision, SQLTypeInfo::scale, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_scale(), SQLTypeInfo::set_type(), to_string(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1196  {
1197  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1198  parquet_column->logical_type().get())) {
1199  auto parquet_precision = decimal_logical_column->precision();
1200  auto parquet_scale = decimal_logical_column->scale();
1201  if (parquet_precision > sql_constants::kMaxNumericPrecision) {
1202  throw ForeignStorageException(
1203  "Parquet column \"" + parquet_column->ToString() +
1204  "\" has decimal precision of " + std::to_string(parquet_precision) +
1205  " which is too high to import, maximum precision supported is " +
1207  }
1208  SQLTypeInfo type;
1209  type.set_type(kDECIMAL);
1211  type.set_precision(parquet_precision);
1212  type.set_scale(parquet_scale);
1213  type.set_fixed_size();
1214  return type;
1215  }
1216  UNREACHABLE()
1217  << " a Parquet column's decimal logical type failed to be read appropriately";
1218  return {};
1219 }
void set_compression(EncodingType c)
Definition: sqltypes.h:481
static constexpr int32_t kMaxNumericPrecision
Definition: sqltypes.h:58
#define UNREACHABLE()
Definition: Logger.h:338
std::string to_string(char const *&&v)
void set_fixed_size()
Definition: sqltypes.h:479
void set_scale(int s)
Definition: sqltypes.h:475
void set_precision(int d)
Definition: sqltypes.h:473
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_floating_point_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1237 of file LazyParquetChunkLoader.cpp.

References kDOUBLE, kENCODING_NONE, kFLOAT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1238  {
1239  SQLTypeInfo type;
1240  if (parquet_column->physical_type() == parquet::Type::FLOAT) {
1241  type.set_type(kFLOAT);
1242  } else if (parquet_column->physical_type() == parquet::Type::DOUBLE) {
1243  type.set_type(kDOUBLE);
1244  } else {
1245  UNREACHABLE();
1246  }
1248  type.set_fixed_size();
1249  return type;
1250 }
void set_compression(EncodingType c)
Definition: sqltypes.h:481
#define UNREACHABLE()
Definition: Logger.h:338
void set_fixed_size()
Definition: sqltypes.h:479
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_integral_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1279 of file LazyParquetChunkLoader.cpp.

References CHECK, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and within_range().

Referenced by suggest_column_scalar_type().

1279  {
1280  SQLTypeInfo type;
1282  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1283  parquet_column->logical_type().get())) {
1284  auto bit_width = int_logical_column->bit_width();
1285  if (!int_logical_column->is_signed()) {
1286  if (within_range(33, 64, bit_width)) {
1287  throw ForeignStorageException(
1288  "Unsigned integer column \"" + parquet_column->path()->ToDotString() +
1289  "\" in Parquet file with 64 bit-width has no supported type for ingestion "
1290  "that will not result in data loss");
1291  } else if (within_range(17, 32, bit_width)) {
1292  type.set_type(kBIGINT);
1293  } else if (within_range(9, 16, bit_width)) {
1294  type.set_type(kINT);
1295  } else if (within_range(0, 8, bit_width)) {
1296  type.set_type(kSMALLINT);
1297  }
1298  } else {
1299  if (within_range(33, 64, bit_width)) {
1300  type.set_type(kBIGINT);
1301  } else if (within_range(17, 32, bit_width)) {
1302  type.set_type(kINT);
1303  } else if (within_range(9, 16, bit_width)) {
1304  type.set_type(kSMALLINT);
1305  } else if (within_range(0, 8, bit_width)) {
1306  type.set_type(kTINYINT);
1307  }
1308  }
1309  type.set_fixed_size();
1310  return type;
1311  }
1312 
1313  CHECK(parquet_column->logical_type()->is_none());
1314  if (parquet_column->physical_type() == parquet::Type::INT32) {
1315  type.set_type(kINT);
1316  } else {
1317  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
1318  type.set_type(kBIGINT);
1319  }
1320  type.set_fixed_size();
1321  return type;
1322 }
void set_compression(EncodingType c)
Definition: sqltypes.h:481
void set_fixed_size()
Definition: sqltypes.h:479
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqltypes.h:72
bool within_range(int64_t lower_bound, int64_t upper_bound, int64_t value)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_string_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1481 of file LazyParquetChunkLoader.cpp.

References CHECK, is_valid_parquet_string(), kENCODING_DICT, kTEXT, SQLTypeInfo::set_comp_param(), SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1481  {
1482  CHECK(is_valid_parquet_string(parquet_column));
1483  SQLTypeInfo type;
1484  type.set_type(kTEXT);
1486  type.set_comp_param(0); // `comp_param` is expected either to be zero or
1487  // equal to a string dictionary id in some code
1488  // paths, since we don't have a string dictionary we
1489  // set this to zero
1490  type.set_fixed_size();
1491  return type;
1492 }
void set_compression(EncodingType c)
Definition: sqltypes.h:481
void set_fixed_size()
Definition: sqltypes.h:479
void set_comp_param(int p)
Definition: sqltypes.h:482
Definition: sqltypes.h:79
#define CHECK(condition)
Definition: Logger.h:291
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_time_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1438 of file LazyParquetChunkLoader.cpp.

References CHECK, kENCODING_NONE, kTIME, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1438  {
1439  CHECK(parquet_column->logical_type()->is_time());
1440  SQLTypeInfo type;
1441  type.set_type(kTIME);
1442  type.set_compression(kENCODING_NONE);
1443  type.set_fixed_size();
1444  return type;
1445 }
Definition: sqltypes.h:76
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_timestamp_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1404 of file LazyParquetChunkLoader.cpp.

References is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_NONE, kTIMESTAMP, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1404  {
1405  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1406  parquet_column->logical_type().get())) {
1407  SQLTypeInfo type;
1408  type.set_type(kTIMESTAMP);
1410  if (is_nanosecond_precision(timestamp_logical_column)) {
1411  type.set_precision(9);
1412  } else if (is_microsecond_precision(timestamp_logical_column)) {
1413  type.set_precision(6);
1414  } else if (is_millisecond_precision(timestamp_logical_column)) {
1415  type.set_precision(3);
1416  }
1417  type.set_fixed_size();
1418  return type;
1419  }
1420  UNREACHABLE();
1421  return {};
1422 }
void set_compression(EncodingType c)
Definition: sqltypes.h:481
#define UNREACHABLE()
Definition: Logger.h:338
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
void set_fixed_size()
Definition: sqltypes.h:479
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
void set_precision(int d)
Definition: sqltypes.h:473
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_missing_metadata_error ( const int  row_group_index,
const int  column_index,
const std::string &  file_path 
)

Definition at line 1622 of file LazyParquetChunkLoader.cpp.

References to_string().

Referenced by validate_column_mapping_and_row_group_metadata().

1624  {
1625  throw std::runtime_error{
1626  "Statistics metadata is required for all row groups. Metadata is missing for "
1627  "row group index: " +
1628  std::to_string(row_group_index) +
1629  ", column index: " + std::to_string(column_index) + ", file path: " + file_path};
1630 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_row_group_larger_than_fragment_size_error ( const MaxRowGroupSizeStats  max_row_group_stats,
const int  fragment_size 
)

Definition at line 1638 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::file_path, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_index, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_size, and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1640  {
1641  auto metadata_scan_exception = MetadataScanInfeasibleFragmentSizeException{
1642  "Parquet file has a row group size that is larger than the fragment size. "
1643  "Please set the table fragment size to a number that is larger than the "
1644  "row group size. Row group index: " +
1645  std::to_string(max_row_group_stats.max_row_group_index) +
1646  ", row group size: " + std::to_string(max_row_group_stats.max_row_group_size) +
1647  ", fragment size: " + std::to_string(fragment_size) +
1648  ", file path: " + max_row_group_stats.file_path};
1649  metadata_scan_exception.min_feasible_fragment_size_ =
1650  max_row_group_stats.max_row_group_size;
1651  throw metadata_scan_exception;
1652 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping ( const parquet::ColumnDescriptor *  parquet_column,
const ColumnDescriptor omnisci_column 
)

Definition at line 1526 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnName, ColumnDescriptor::columnType, foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::get_type_name(), SQLTypeInfo::is_array(), is_valid_parquet_list_column(), foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported(), LOG, run_benchmark_import::type, validate_max_repetition_and_definition_level(), and logger::WARNING.

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_column_mapping_and_row_group_metadata().

1527  {
1528  validate_max_repetition_and_definition_level(omnisci_column, parquet_column);
1529  bool allowed_type = false;
1530  if (omnisci_column->columnType.is_array()) {
1531  if (is_valid_parquet_list_column(parquet_column)) {
1532  auto omnisci_column_sub_type_column =
1533  get_sub_type_column_descriptor(omnisci_column);
1534  allowed_type = LazyParquetChunkLoader::isColumnMappingSupported(
1535  omnisci_column_sub_type_column.get(), parquet_column);
1536  }
1537  } else {
1538  allowed_type =
1539  LazyParquetChunkLoader::isColumnMappingSupported(omnisci_column, parquet_column);
1540  }
1541  if (!allowed_type) {
1542  auto logical_type = parquet_column->logical_type();
1543  if (logical_type->is_timestamp()) {
1544  auto timestamp_type =
1545  dynamic_cast<const parquet::TimestampLogicalType*>(logical_type.get());
1546  CHECK(timestamp_type);
1547 
1548  if (!timestamp_type->is_adjusted_to_utc()) {
1549  LOG(WARNING) << "Non-UTC timezone specified in Parquet file for column \""
1550  << omnisci_column->columnName
1551  << "\". Only UTC timezone is currently supported.";
1552  }
1553  }
1554  std::string parquet_type;
1555  parquet::Type::type physical_type = parquet_column->physical_type();
1556  if (parquet_column->logical_type()->is_none()) {
1557  parquet_type = parquet::TypeToString(physical_type);
1558  } else {
1559  parquet_type = logical_type->ToString();
1560  }
1561  std::string omnisci_type = omnisci_column->columnType.get_type_name();
1562  throw std::runtime_error{"Conversion from Parquet type \"" + parquet_type +
1563  "\" to HeavyDB type \"" + omnisci_type +
1564  "\" is not allowed. Please use an appropriate column type."};
1565  }
1566 }
#define LOG(tag)
Definition: Logger.h:285
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::string get_type_name() const
Definition: sqltypes.h:484
#define CHECK(condition)
Definition: Logger.h:291
void validate_max_repetition_and_definition_level(const ColumnDescriptor *omnisci_column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
SQLTypeInfo columnType
std::string columnName
bool is_array() const
Definition: sqltypes.h:585

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_column_mapping_and_row_group_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema,
const bool  do_metadata_stats_validation 
)

Definition at line 1654 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::getLogicalColumns(), is_valid_parquet_list_column(), report::stats, throw_missing_metadata_error(), and validate_allowed_mapping().

Referenced by validate_parquet_metadata().

1658  {
1659  auto column_it = schema.getLogicalColumns().begin();
1660  MaxRowGroupSizeStats max_row_group_stats{0, 0};
1661  for (int i = 0; i < file_metadata->num_columns(); ++i, ++column_it) {
1662  const parquet::ColumnDescriptor* descr = file_metadata->schema()->Column(i);
1663  try {
1664  validate_allowed_mapping(descr, *column_it);
1665  } catch (std::runtime_error& e) {
1666  std::stringstream error_message;
1667  error_message << e.what() << " Parquet column: " << descr->path()->ToDotString()
1668  << ", HeavyDB column: " << (*column_it)->columnName
1669  << ", Parquet file: " << file_path << ".";
1670  throw std::runtime_error(error_message.str());
1671  }
1672 
1673  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
1674  auto group_metadata = file_metadata->RowGroup(r);
1675  auto num_rows = group_metadata->num_rows();
1676  if (num_rows == 0) {
1677  continue;
1678  } else if (num_rows > max_row_group_stats.max_row_group_size) {
1679  max_row_group_stats.max_row_group_size = num_rows;
1680  max_row_group_stats.max_row_group_index = r;
1681  max_row_group_stats.file_path = file_path;
1682  }
1683 
1684  if (do_metadata_stats_validation) {
1685  auto column_chunk = group_metadata->ColumnChunk(i);
1686  bool contains_metadata = column_chunk->is_stats_set();
1687  if (contains_metadata) {
1688  auto stats = column_chunk->statistics();
1689  bool is_all_nulls = stats->null_count() == column_chunk->num_values();
1690  bool is_list = is_valid_parquet_list_column(file_metadata->schema()->Column(i));
1691  // Given a list, it is possible it has no min or max if it is comprised
1692  // only of empty lists & nulls. This can not be detected by comparing
1693  // the null count; therefore we afford list types the benefit of the
1694  // doubt in this situation.
1695  if (!(stats->HasMinMax() || is_all_nulls || is_list)) {
1696  contains_metadata = false;
1697  }
1698  }
1699 
1700  if (!contains_metadata) {
1701  throw_missing_metadata_error(r, i, file_path);
1702  }
1703  }
1704  }
1705  }
1706  return max_row_group_stats;
1707 }
dictionary stats
Definition: report.py:116
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
void throw_missing_metadata_error(const int row_group_index, const int column_index, const std::string &file_path)
void validate_allowed_mapping(const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1447 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kDATE, kENCODING_DATE_IN_DAYS, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1448  {
1449  if (!(omnisci_column->columnType.get_type() == kDATE &&
1450  ((omnisci_column->columnType.get_compression() == kENCODING_DATE_IN_DAYS &&
1451  (omnisci_column->columnType.get_comp_param() ==
1452  0 // DATE ENCODING DAYS (32) specifies comp_param of 0
1453  || omnisci_column->columnType.get_comp_param() == 16)) ||
1454  omnisci_column->columnType.get_compression() ==
1455  kENCODING_NONE // for array types
1456  ))) {
1457  return false;
1458  }
1459  return parquet_column->logical_type()->is_date() ||
1460  parquet_column->logical_type()
1461  ->is_timestamp(); // to support TIMESTAMP -> DATE coercion
1462 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
Definition: sqltypes.h:80
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1182 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_precision(), SQLTypeInfo::get_scale(), SQLTypeInfo::is_decimal(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1183  {
1184  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1185  parquet_column->logical_type().get())) {
1186  return omnisci_column->columnType.get_precision() ==
1187  decimal_logical_column->precision() &&
1188  omnisci_column->columnType.get_scale() == decimal_logical_column->scale() &&
1189  omnisci_column->columnType.is_decimal() &&
1190  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1191  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1192  }
1193  return false;
1194 }
HOST DEVICE int get_scale() const
Definition: sqltypes.h:396
int get_precision() const
Definition: sqltypes.h:394
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
SQLTypeInfo columnType
bool is_decimal() const
Definition: sqltypes.h:570

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema ( const parquet::arrow::FileReader *  reference_file_reader,
const parquet::arrow::FileReader *  new_file_reader,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 1500 of file LazyParquetChunkLoader.cpp.

References foreign_storage::get_column_descriptor(), to_string(), and foreign_storage::validate_equal_column_descriptor().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan(), and foreign_storage::LazyParquetChunkLoader::previewFiles().

1503  {
1504  const auto reference_num_columns =
1505  reference_file_reader->parquet_reader()->metadata()->num_columns();
1506  const auto new_num_columns =
1507  new_file_reader->parquet_reader()->metadata()->num_columns();
1508  if (reference_num_columns != new_num_columns) {
1509  throw std::runtime_error{"Parquet file \"" + new_file_path +
1510  "\" has a different schema. Please ensure that all Parquet "
1511  "files use the same schema. Reference Parquet file: \"" +
1512  reference_file_path + "\" has " +
1513  std::to_string(reference_num_columns) +
1514  " columns. New Parquet file \"" + new_file_path + "\" has " +
1515  std::to_string(new_num_columns) + " columns."};
1516  }
1517 
1518  for (int i = 0; i < reference_num_columns; i++) {
1519  validate_equal_column_descriptor(get_column_descriptor(reference_file_reader, i),
1520  get_column_descriptor(new_file_reader, i),
1521  reference_file_path,
1522  new_file_path);
1523  }
1524 }
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
std::string to_string(char const *&&v)
const ColumnDescriptor * get_column_descriptor(const shared::ColumnKey &column_key)
Definition: Execute.h:213

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1221 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), SQLTypeInfo::is_fp(), kENCODING_NONE, and kFLOAT.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1222  {
1223  if (!omnisci_column->columnType.is_fp()) {
1224  return false;
1225  }
1226  // check if mapping is a valid coerced or non-coerced floating point mapping
1227  // with no annotation (floating point columns have no annotation in the
1228  // Parquet specification)
1229  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1230  return (parquet_column->physical_type() == parquet::Type::DOUBLE) ||
1231  (parquet_column->physical_type() == parquet::Type::FLOAT &&
1232  omnisci_column->columnType.get_type() == kFLOAT);
1233  }
1234  return false;
1235 }
bool is_fp() const
Definition: sqltypes.h:573
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1494 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_geometry(), and is_valid_parquet_string().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1495  {
1496  return is_valid_parquet_string(parquet_column) &&
1497  omnisci_column->columnType.is_geometry();
1498 }
bool is_geometry() const
Definition: sqltypes.h:597
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1252 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_size(), SQLTypeInfo::is_integer(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1253  {
1254  if (!omnisci_column->columnType.is_integer()) {
1255  return false;
1256  }
1257  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1258  parquet_column->logical_type().get())) {
1259  CHECK(omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1260  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1261  const int bits_per_byte = 8;
1262  // unsigned types are permitted to map to a wider integral type in order to avoid
1263  // precision loss
1264  const int bit_widening_factor = int_logical_column->is_signed() ? 1 : 2;
1265  return omnisci_column->columnType.get_size() * bits_per_byte <=
1266  int_logical_column->bit_width() * bit_widening_factor;
1267  }
1268  // check if mapping is a valid coerced or non-coerced integral mapping with no
1269  // annotation
1270  if ((omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1271  omnisci_column->columnType.get_compression() == kENCODING_FIXED)) {
1272  return (parquet_column->physical_type() == parquet::Type::INT64) ||
1273  (parquet_column->physical_type() == parquet::Type::INT32 &&
1274  omnisci_column->columnType.get_size() <= 4);
1275  }
1276  return false;
1277 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
bool is_integer() const
Definition: sqltypes.h:567
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_list_column_metadata_statistics ( const parquet::ParquetFileReader *  reader,
const int  row_group_index,
const int  column_index,
const int16_t *  def_levels,
const int64_t  num_levels,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1078 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_list_column(), report::stats, and foreign_storage::validate_and_get_column_metadata_statistics().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1084  {
1085  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1086  if (!is_valid_parquet_list) {
1087  return;
1088  }
1089  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1090  reader->metadata()->RowGroup(row_group_index);
1091  auto column_metadata = group_metadata->ColumnChunk(column_index);
1092  // In case of a empty row group do not validate
1093  if (group_metadata->num_rows() == 0) {
1094  return;
1095  }
1096  auto stats = validate_and_get_column_metadata_statistics(column_metadata.get());
1097  if (!stats->HasMinMax()) {
1098  auto find_it = std::find_if(def_levels,
1099  def_levels + num_levels,
1100  [](const int16_t def_level) { return def_level == 3; });
1101  if (find_it != def_levels + num_levels) {
1102  throw std::runtime_error(
1103  "No minimum and maximum statistic set in list column but non-null & non-empty "
1104  "array/value detected.");
1105  }
1106  }
1107 }
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
dictionary stats
Definition: report.py:116
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level ( const ColumnDescriptor omnisci_column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1129 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_allowed_mapping().

1131  {
1132  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1133  if (is_valid_parquet_list && !omnisci_column_descriptor->columnType.is_array()) {
1134  throw std::runtime_error(
1135  "Unsupported mapping detected. Column '" +
1136  parquet_column_descriptor->path()->ToDotString() +
1137  "' detected to be a parquet list but HeavyDB mapped column '" +
1138  omnisci_column_descriptor->columnName + "' is not an array.");
1139  }
1140  if (is_valid_parquet_list) {
1141  if (parquet_column_descriptor->max_repetition_level() != 1 ||
1142  parquet_column_descriptor->max_definition_level() != 3) {
1143  throw std::runtime_error(
1144  "Incorrect schema max repetition level detected in column '" +
1145  parquet_column_descriptor->path()->ToDotString() +
1146  "'. Expected a max repetition level of 1 and max definition level of 3 for "
1147  "list column but column has a max "
1148  "repetition level of " +
1149  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1150  " and a max definition level of " +
1151  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1152  }
1153  } else {
1154  if (parquet_column_descriptor->max_repetition_level() != 0 ||
1155  !(parquet_column_descriptor->max_definition_level() == 1 ||
1156  parquet_column_descriptor->max_definition_level() == 0)) {
1157  throw std::runtime_error(
1158  "Incorrect schema max repetition level detected in column '" +
1159  parquet_column_descriptor->path()->ToDotString() +
1160  "'. Expected a max repetition level of 0 and max definition level of 1 or 0 "
1161  "for "
1162  "flat column but column has a max "
1163  "repetition level of " +
1164  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1165  " and a max definition level of " +
1166  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1167  }
1168  }
1169 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::string to_string(char const *&&v)
SQLTypeInfo columnType
std::string columnName
bool is_array() const
Definition: sqltypes.h:585

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1351 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kBOOLEAN, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1352  {
1353  bool is_none_encoded_mapping =
1354  omnisci_column->columnType.get_compression() == kENCODING_NONE &&
1355  (parquet_column->physical_type() == parquet::Type::BOOLEAN &&
1356  omnisci_column->columnType.get_type() == kBOOLEAN);
1357  return parquet_column->logical_type()->is_none() && is_none_encoded_mapping;
1358 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1612 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::numLogicalColumns(), and foreign_storage::throw_number_of_columns_mismatch_error().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_parquet_metadata().

1615  {
1616  if (schema.numLogicalColumns() != file_metadata->num_columns()) {
1618  schema.numLogicalColumns(), file_metadata->num_columns(), file_path);
1619  }
1620 }
void throw_number_of_columns_mismatch_error(size_t num_table_cols, size_t num_file_cols, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema,
const bool  do_metadata_stats_validation 
)

Definition at line 1709 of file LazyParquetChunkLoader.cpp.

References validate_column_mapping_and_row_group_metadata(), and validate_number_of_columns().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1713  {
1714  validate_number_of_columns(file_metadata, file_path, schema);
1716  file_metadata, file_path, schema, do_metadata_stats_validation);
1717 }
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema, const bool do_metadata_stats_validation)
void validate_number_of_columns(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1473 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1474  {
1475  return is_valid_parquet_string(parquet_column) &&
1476  omnisci_column->columnType.is_string() &&
1477  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1478  omnisci_column->columnType.get_compression() == kENCODING_DICT);
1479 }
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:561

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1424 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kENCODING_FIXED, kENCODING_NONE, and kTIME.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1425  {
1426  if (!(omnisci_column->columnType.get_type() == kTIME &&
1427  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1428  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1429  omnisci_column->columnType.get_comp_param() == 32)))) {
1430  return false;
1431  }
1432  if (parquet_column->logical_type()->is_time()) {
1433  return true;
1434  }
1435  return false;
1436 }
Definition: sqltypes.h:76
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1369 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_dimension(), SQLTypeInfo::get_type(), is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_FIXED, kENCODING_NONE, and kTIMESTAMP.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1370  {
1371  if (!(omnisci_column->columnType.get_type() == kTIMESTAMP &&
1372  ((omnisci_column->columnType.get_compression() == kENCODING_NONE) ||
1373  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1374  omnisci_column->columnType.get_comp_param() == 32)))) {
1375  return false;
1376  }
1377  // check the annotated case
1378  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1379  parquet_column->logical_type().get())) {
1380  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1381  return omnisci_column->columnType.get_dimension() == 0 ||
1382  ((is_nanosecond_precision(omnisci_column) &&
1383  is_nanosecond_precision(timestamp_logical_column)) ||
1384  (is_microsecond_precision(omnisci_column) &&
1385  is_microsecond_precision(timestamp_logical_column)) ||
1386  (is_millisecond_precision(omnisci_column) &&
1387  is_millisecond_precision(timestamp_logical_column)));
1388  }
1389  if (omnisci_column->columnType.get_compression() == kENCODING_FIXED) {
1390  return omnisci_column->columnType.get_dimension() == 0;
1391  }
1392  }
1393  // check the unannotated case
1394  if (parquet_column->logical_type()->is_none() &&
1395  ((parquet_column->physical_type() == parquet::Type::INT32 &&
1396  omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1397  omnisci_column->columnType.get_comp_param() == 32) ||
1398  parquet_column->physical_type() == parquet::Type::INT64)) {
1399  return true;
1400  }
1401  return false;
1402 }
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::within_range ( int64_t  lower_bound,
int64_t  upper_bound,
int64_t  value 
)

Definition at line 56 of file LazyParquetChunkLoader.cpp.

References gpu_enabled::upper_bound().

Referenced by suggest_integral_mapping().

// Returns true when value lies in the inclusive interval
// [lower_bound, upper_bound].
bool within_range(int64_t lower_bound, int64_t upper_bound, int64_t value) {
  const bool at_or_above_lower = !(value < lower_bound);
  const bool at_or_below_upper = !(upper_bound < value);
  return at_or_above_lower && at_or_below_upper;
}
DEVICE auto upper_bound(ARGS &&...args)
Definition: gpu_enabled.h:123
DEVICE auto lower_bound(ARGS &&...args)
Definition: gpu_enabled.h:78

+ Here is the call graph for this function:

+ Here is the caller graph for this function: