OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp} Namespace Reference

Classes

struct  MaxRowGroupSizeStats
 

Functions

bool within_range (int64_t lower_bound, int64_t upper_bound, int64_t value)
 
bool is_valid_parquet_string (const parquet::ColumnDescriptor *parquet_column)
 
bool is_valid_parquet_list_column (const parquet::ColumnDescriptor *parquet_column)
 Detect a valid list parquet column. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder_with_omnisci_type (const ColumnDescriptor *column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename U , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
 Create a signed or unsigned integral parquet encoder using types. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder_with_omnisci_type (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const int bit_width, const bool is_signed)
 Create an integral parquet encoder using types. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
 
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan, const bool is_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
 Create a Parquet specific encoder for a Parquet to OmniSci mapping. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import (std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
void validate_definition_levels (const parquet::ParquetFileReader *reader, const int row_group_index, const int column_index, const int16_t *def_levels, const int64_t num_levels, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void validate_max_repetition_and_definition_level (const ColumnDescriptor *omnisci_column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void resize_values_buffer (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::vector< int8_t > &values)
 
bool validate_decimal_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_decimal_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_floating_point_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_floating_point_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_integral_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_integral_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool is_nanosecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_nanosecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_microsecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_microsecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_millisecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_millisecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool validate_none_type_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_boolean_type_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_timestamp_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_timestamp_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_time_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_time_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_date_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_date_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_string_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_string_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_array_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_geospatial_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
void validate_equal_schema (const parquet::arrow::FileReader *reference_file_reader, const parquet::arrow::FileReader *new_file_reader, const std::string &reference_file_path, const std::string &new_file_path)
 
void validate_allowed_mapping (const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)
 
SQLTypeInfo suggest_column_scalar_type (const parquet::ColumnDescriptor *parquet_column)
 
void validate_number_of_columns (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
void throw_missing_metadata_error (const int row_group_index, const int column_index, const std::string &file_path)
 
void throw_row_group_larger_than_fragment_size_error (const MaxRowGroupSizeStats max_row_group_stats, const int fragment_size)
 
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
MaxRowGroupSizeStats validate_parquet_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
std::list< RowGroupMetadata > metadata_scan_rowgroup_interval (const std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const RowGroupInterval &row_group_interval, const ReaderPtr &reader, const ForeignTableSchema &schema)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_import (const std::map< int, Chunk_NS::Chunk > chunks, const ForeignTableSchema &schema, const ReaderPtr &reader, const std::map< int, StringDictionary * > column_dictionaries, const int64_t num_rows, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
std::map< int, std::shared_ptr
< ParquetEncoder > > 
populate_encoder_map_for_metadata_scan (const Interval< ColumnType > &column_interval, const ForeignTableSchema &schema, const ReaderPtr &reader, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool do_metadata_stats_validation)
 

Function Documentation

std::shared_ptr< ParquetEncoder > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  is_for_detect 
)

Definition at line 1017 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_encoder(), foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), SQLTypeInfo::is_fixlen_array(), and is_valid_parquet_list_column().

Referenced by create_parquet_encoder().

1025  {
1026  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column);
1027  if (!is_valid_parquet_list || !omnisci_column->columnType.is_array()) {
1028  return {};
1029  }
1030  std::unique_ptr<ColumnDescriptor> omnisci_column_sub_type_column =
1031  get_sub_type_column_descriptor(omnisci_column);
1032  auto encoder = create_parquet_encoder(omnisci_column_sub_type_column.get(),
1033  parquet_column,
1034  chunks,
1035  string_dictionary,
1036  chunk_metadata,
1037  nullptr,
1038  is_metadata_scan,
1039  is_for_import,
1040  is_for_detect);
1041  CHECK(encoder.get());
1042  auto scalar_encoder = std::dynamic_pointer_cast<ParquetScalarEncoder>(encoder);
1043  CHECK(scalar_encoder);
1044  if (!is_for_import) {
1045  if (!is_for_detect) {
1046  if (omnisci_column->columnType.is_fixlen_array()) {
1047  encoder = std::make_shared<ParquetFixedLengthArrayEncoder>(
1048  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1049  scalar_encoder,
1050  omnisci_column);
1051  } else {
1052  encoder = std::make_shared<ParquetVariableLengthArrayEncoder>(
1053  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1054  is_metadata_scan ? nullptr : chunks.begin()->getIndexBuf(),
1055  scalar_encoder,
1056  omnisci_column);
1057  }
1058  } else { // is_for_detect
1059  encoder = std::make_shared<ParquetArrayDetectEncoder>(
1060  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1061  }
1062  } else { // is_for_import
1063  encoder = std::make_shared<ParquetArrayImportEncoder>(
1064  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1065  }
1066  return encoder;
1067 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
bool is_fixlen_array() const
Definition: sqltypes.h:520
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.
bool is_array() const
Definition: sqltypes.h:518

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 735 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, kENCODING_DATE_IN_DAYS, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

739  {
740  auto column_type = omnisci_column->columnType;
741  if (parquet_column->logical_type()->is_date() && column_type.is_date()) {
742  if (column_type.get_compression() == kENCODING_DATE_IN_DAYS) {
743  if (is_metadata_scan_or_for_import) {
744  if (column_type.get_comp_param() ==
745  0) { // DATE ENCODING FIXED (32) uses comp param 0
746  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int32_t>>(
747  buffer);
748  } else if (column_type.get_comp_param() == 16) {
749  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int16_t>>(
750  buffer);
751  } else {
752  UNREACHABLE();
753  }
754  } else {
755  if (column_type.get_comp_param() ==
756  0) { // DATE ENCODING FIXED (32) uses comp param 0
757  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t>>(
758  buffer, omnisci_column, parquet_column);
759  } else if (column_type.get_comp_param() == 16) {
760  return std::make_shared<ParquetFixedLengthEncoder<int16_t, int32_t>>(
761  buffer, omnisci_column, parquet_column);
762  } else {
763  UNREACHABLE();
764  }
765  }
766  } else if (column_type.get_compression() == kENCODING_NONE) { // for array types
767  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int64_t>>(
768  buffer, omnisci_column, parquet_column);
769  } else {
770  UNREACHABLE();
771  }
772  }
773  return {};
774 }
#define UNREACHABLE()
Definition: Logger.h:267
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 692 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_date_from_timestamp_encoder_with_types(), kENCODING_DATE_IN_DAYS, and UNREACHABLE.

Referenced by create_parquet_encoder().

696  {
697  auto column_type = omnisci_column->columnType;
698  if (parquet_column->logical_type()->is_timestamp() && column_type.is_date()) {
699  CHECK(column_type.get_compression() == kENCODING_DATE_IN_DAYS);
700  if (is_metadata_scan_or_for_import) {
701  if (column_type.get_comp_param() ==
702  0) { // DATE ENCODING FIXED (32) uses comp param 0
703  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
704  int64_t,
705  int32_t>(
706  omnisci_column, parquet_column, buffer, true);
707  } else if (column_type.get_comp_param() == 16) {
708  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
709  int64_t,
710  int16_t>(
711  omnisci_column, parquet_column, buffer, true);
712  } else {
713  UNREACHABLE();
714  }
715  } else {
716  if (column_type.get_comp_param() ==
717  0) { // DATE ENCODING FIXED (32) uses comp param 0
718  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
719  int64_t,
720  int32_t>(
721  omnisci_column, parquet_column, buffer, false);
722  } else if (column_type.get_comp_param() == 16) {
723  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
724  int64_t,
725  int16_t>(
726  omnisci_column, parquet_column, buffer, false);
727  } else {
728  UNREACHABLE();
729  }
730  }
731  }
732  return {};
733 }
#define UNREACHABLE()
Definition: Logger.h:267
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 507 of file LazyParquetChunkLoader.cpp.

References heavydb.dtypes::T, and UNREACHABLE.

Referenced by create_parquet_date_from_timestamp_encoder().

511  {
512  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
513  parquet_column->logical_type().get())) {
514  switch (timestamp_logical_type->time_unit()) {
515  case parquet::LogicalType::TimeUnit::MILLIS:
516  if (is_metadata_scan_or_for_import) {
517  return std::make_shared<
518  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L, NullType>>(
519  buffer, omnisci_column, parquet_column);
520  }
521  return std::make_shared<
522  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L, NullType>>(
523  buffer, omnisci_column, parquet_column);
524  case parquet::LogicalType::TimeUnit::MICROS:
525  if (is_metadata_scan_or_for_import) {
526  return std::make_shared<
527  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
528  buffer, omnisci_column, parquet_column);
529  }
530  return std::make_shared<
531  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
532  buffer, omnisci_column, parquet_column);
533  case parquet::LogicalType::TimeUnit::NANOS:
534  if (is_metadata_scan_or_for_import) {
535  return std::make_shared<
536  ParquetDateInSecondsFromTimestampEncoder<V,
537  T,
538  1000L * 1000L * 1000L,
539  NullType>>(
540  buffer, omnisci_column, parquet_column);
541  }
542  return std::make_shared<
543  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
544  buffer, omnisci_column, parquet_column);
545  default:
546  UNREACHABLE();
547  }
548  } else {
549  UNREACHABLE();
550  }
551  return {};
552 }
ParquetTimestampEncoder< V, T, conversion_denominator, NullType > ParquetDateInSecondsFromTimestampEncoder
#define UNREACHABLE()
Definition: Logger.h:267

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 170 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

174  {
175  if (parquet_column->logical_type()->is_decimal()) {
176  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
177  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int64_t>(
178  omnisci_column, parquet_column, buffer);
179  }
180  CHECK(omnisci_column->columnType.get_compression() == kENCODING_FIXED);
181  if (is_metadata_scan_or_for_import) {
182  switch (omnisci_column->columnType.get_comp_param()) {
183  case 16:
184  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int16_t>(
185  omnisci_column, parquet_column, buffer);
186  case 32:
187  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int32_t>(
188  omnisci_column, parquet_column, buffer);
189  default:
190  UNREACHABLE();
191  }
192  } else {
193  switch (omnisci_column->columnType.get_comp_param()) {
194  case 16:
195  return create_parquet_decimal_encoder_with_omnisci_type<int16_t, int16_t>(
196  omnisci_column, parquet_column, buffer);
197  case 32:
198  return create_parquet_decimal_encoder_with_omnisci_type<int32_t, int32_t>(
199  omnisci_column, parquet_column, buffer);
200  default:
201  UNREACHABLE();
202  }
203  }
204  }
205  return {};
206 }
#define UNREACHABLE()
Definition: Logger.h:267
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder_with_omnisci_type ( const ColumnDescriptor *  column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor,
AbstractBuffer *  buffer 
)

Definition at line 146 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

149  {
150  switch (parquet_column_descriptor->physical_type()) {
151  case parquet::Type::INT32:
152  return std::make_shared<ParquetDecimalEncoder<V, int32_t, NullType>>(
153  buffer, column_descriptor, parquet_column_descriptor);
154  case parquet::Type::INT64:
155  return std::make_shared<ParquetDecimalEncoder<V, int64_t, NullType>>(
156  buffer, column_descriptor, parquet_column_descriptor);
157  case parquet::Type::FIXED_LEN_BYTE_ARRAY:
158  return std::make_shared<
159  ParquetDecimalEncoder<V, parquet::FixedLenByteArray, NullType>>(
160  buffer, column_descriptor, parquet_column_descriptor);
161  case parquet::Type::BYTE_ARRAY:
162  return std::make_shared<ParquetDecimalEncoder<V, parquet::ByteArray, NullType>>(
163  buffer, column_descriptor, parquet_column_descriptor);
164  default:
165  UNREACHABLE();
166  }
167  return {};
168 }
#define UNREACHABLE()
Definition: Logger.h:267
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const RenderGroupAnalyzerMap *  render_group_analyzer_map,
const bool  is_metadata_scan = false,
const bool  is_for_import = false,
const bool  is_for_detect = false 
)

Create a Parquet specific encoder for a Parquet to OmniSci mapping.

Parameters
omnisci_column- the descriptor of OmniSci column
parquet_column- the descriptor of Parquet column
chunks- list of chunks to populate (the case of more than one chunk happens only if a logical column expands to multiple physical columns)
string_dictionary- string dictionary used in encoding for string dictionary encoded columns
chunk_metadata- similar to the list of chunks, a list of chunk metadata that is populated
is_metadata_scan- a flag indicating if the encoders created should be for a metadata scan
is_for_import- a flag indicating if the encoders created should be for import
Returns
An appropriate Parquet encoder for the use case defined by the Parquet to OmniSci mapping.

Notes:

  • In the case of a metadata scan, the type of the encoder created may significantly change (for example in bit width.) This is because it is common for OmniSci to store metadata in a different format altogether than the data itself (see for example FixedLengthEncoder.)
  • This function and the function isColumnMappingSupported work in conjunction with each other. For example, once a mapping is known to be allowed (since isColumnMappingSupported returned true) this function does not have to check many corner cases exhaustively as it would be redundant with what was checked in isColumnMappingSupported.

Definition at line 901 of file LazyParquetChunkLoader.cpp.

References CHECK, create_parquet_array_encoder(), create_parquet_date_encoder(), create_parquet_date_from_timestamp_encoder(), create_parquet_decimal_encoder(), create_parquet_floating_point_encoder(), create_parquet_geospatial_encoder(), create_parquet_integral_encoder(), create_parquet_none_type_encoder(), create_parquet_string_encoder(), create_parquet_time_encoder(), create_parquet_timestamp_encoder(), and UNREACHABLE.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), create_parquet_array_encoder(), create_parquet_encoder_for_import(), and create_parquet_encoder_for_metadata_scan().

910  {
911  CHECK(!(is_metadata_scan && is_for_import));
912  auto buffer = chunks.empty() ? nullptr : chunks.begin()->getBuffer();
913  if (auto encoder = create_parquet_geospatial_encoder(omnisci_column,
914  parquet_column,
915  chunks,
916  chunk_metadata,
917  render_group_analyzer_map,
918  is_metadata_scan,
919  is_for_import)) {
920  return encoder;
921  }
922  if (auto encoder = create_parquet_array_encoder(omnisci_column,
923  parquet_column,
924  chunks,
925  string_dictionary,
926  chunk_metadata,
927  is_metadata_scan,
928  is_for_import,
929  is_for_detect)) {
930  return encoder;
931  }
932  if (auto encoder = create_parquet_decimal_encoder(
933  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
934  return encoder;
935  }
936  if (auto encoder = create_parquet_integral_encoder(
937  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
938  return encoder;
939  }
940  if (auto encoder =
941  create_parquet_floating_point_encoder(omnisci_column, parquet_column, buffer)) {
942  return encoder;
943  }
944  if (auto encoder = create_parquet_timestamp_encoder(
945  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
946  return encoder;
947  }
948  if (auto encoder =
949  create_parquet_none_type_encoder(omnisci_column, parquet_column, buffer)) {
950  return encoder;
951  }
952  if (auto encoder = create_parquet_time_encoder(
953  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
954  return encoder;
955  }
956  if (auto encoder = create_parquet_date_from_timestamp_encoder(
957  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
958  return encoder;
959  }
960  if (auto encoder = create_parquet_date_encoder(
961  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
962  return encoder;
963  }
964  if (auto encoder = create_parquet_string_encoder(
965  omnisci_column,
966  parquet_column,
967  chunks.empty() ? Chunk_NS::Chunk{} : *chunks.begin(),
968  string_dictionary,
969  chunk_metadata,
970  is_for_import,
971  is_for_detect)) {
972  return encoder;
973  }
974  UNREACHABLE();
975  return {};
976 }
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan, const bool is_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect)
#define UNREACHABLE()
Definition: Logger.h:267
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
#define CHECK(condition)
Definition: Logger.h:223
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_import ( std::list< Chunk_NS::Chunk > &  chunks,
const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
StringDictionary *  string_dictionary,
const RenderGroupAnalyzerMap *  render_group_analyzer_map 
)

Intended to be used for the import case.

Definition at line 981 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_import().

986  {
987  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
988  return create_parquet_encoder(omnisci_column,
989  parquet_column,
990  chunks,
991  string_dictionary,
992  chunk_metadata,
993  render_group_analyzer_map,
994  false,
995  true);
996 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_metadata_scan ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const RenderGroupAnalyzerMap render_group_analyzer_map 
)

Intended to be used only with a metadata scan. Creates an incomplete encoder capable of updating metadata.

Definition at line 1002 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_metadata_scan().

1005  {
1006  std::list<Chunk_NS::Chunk> chunks;
1007  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
1008  return create_parquet_encoder(omnisci_column,
1009  parquet_column,
1010  chunks,
1011  nullptr,
1012  chunk_metadata,
1013  render_group_analyzer_map,
1014  true);
1015 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_floating_point_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 426 of file LazyParquetChunkLoader.cpp.

References CHECK, CHECK_EQ, ColumnDescriptor::columnType, kDOUBLE, kENCODING_NONE, kFLOAT, and UNREACHABLE.

Referenced by create_parquet_encoder().

429  {
430  auto column_type = omnisci_column->columnType;
431  if (!column_type.is_fp()) {
432  return {};
433  }
434  CHECK_EQ(column_type.get_compression(), kENCODING_NONE);
435  switch (column_type.get_type()) {
436  case kFLOAT:
437  switch (parquet_column->physical_type()) {
438  case parquet::Type::FLOAT:
439  return std::make_shared<ParquetFixedLengthEncoder<float, float>>(
440  buffer, omnisci_column, parquet_column);
441  case parquet::Type::DOUBLE:
442  return std::make_shared<ParquetFixedLengthEncoder<float, double>>(
443  buffer, omnisci_column, parquet_column);
444  default:
445  UNREACHABLE();
446  }
447  case kDOUBLE:
448  CHECK(parquet_column->physical_type() == parquet::Type::DOUBLE);
449  return std::make_shared<ParquetFixedLengthEncoder<double, double>>(
450  buffer, omnisci_column, parquet_column);
451  default:
452  UNREACHABLE();
453  }
454  return {};
455 }
#define CHECK_EQ(x, y)
Definition: Logger.h:231
#define UNREACHABLE()
Definition: Logger.h:267
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_geospatial_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const RenderGroupAnalyzerMap render_group_analyzer_map,
const bool  is_metadata_scan,
const bool  is_for_import 
)

Definition at line 829 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and is_valid_parquet_string().

Referenced by create_parquet_encoder().

836  {
837  auto column_type = omnisci_column->columnType;
838  if (!is_valid_parquet_string(parquet_column) || !column_type.is_geometry()) {
839  return {};
840  }
841  if (is_for_import) {
842  return std::make_shared<ParquetGeospatialImportEncoder>(chunks); // no RGAMap
843  }
844  if (is_metadata_scan) {
845  return std::make_shared<ParquetGeospatialEncoder>(render_group_analyzer_map);
846  }
847  for (auto chunks_iter = chunks.begin(); chunks_iter != chunks.end(); ++chunks_iter) {
848  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
849  auto& chunk_metadata_ptr = chunk_metadata.back();
850  chunk_metadata_ptr->sqlType = chunks_iter->getColumnDesc()->columnType;
851  }
852  return std::make_shared<ParquetGeospatialEncoder>(
853  parquet_column, chunks, chunk_metadata, render_group_analyzer_map);
854 }
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 296 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, and UNREACHABLE.

Referenced by create_parquet_encoder().

300  {
301  auto column_type = omnisci_column->columnType;
302  auto physical_type = parquet_column->physical_type();
303 
304  int bit_width = -1;
305  int is_signed = false;
306  // handle the integral case with no Parquet annotation
307  if (parquet_column->logical_type()->is_none() && column_type.is_integer()) {
308  if (physical_type == parquet::Type::INT32) {
309  bit_width = 32;
310  } else if (physical_type == parquet::Type::INT64) {
311  bit_width = 64;
312  } else {
313  UNREACHABLE();
314  }
315  is_signed = true;
316  }
317  // handle the integral case with Parquet annotation
318  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
319  parquet_column->logical_type().get())) {
320  bit_width = int_logical_column->bit_width();
321  is_signed = int_logical_column->is_signed();
322  }
323 
324  if (bit_width == -1) { // no valid logical type (with or without annotation) found
325  return {};
326  }
327 
328  const size_t omnisci_data_type_byte_size = column_type.get_size();
329  const size_t parquet_data_type_byte_size = parquet::GetTypeByteSize(physical_type);
330 
331  switch (omnisci_data_type_byte_size) {
332  case 8:
333  CHECK(column_type.get_compression() == kENCODING_NONE);
334  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int64_t>(
335  buffer,
336  omnisci_data_type_byte_size,
337  parquet_data_type_byte_size,
338  bit_width,
339  is_signed);
340  case 4:
341  if (is_metadata_scan_or_for_import && column_type.get_type() == kBIGINT) {
342  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int32_t>(
343  buffer,
344  omnisci_data_type_byte_size,
345  parquet_data_type_byte_size,
346  bit_width,
347  is_signed);
348  }
349  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int32_t>(
350  buffer,
351  omnisci_data_type_byte_size,
352  parquet_data_type_byte_size,
353  bit_width,
354  is_signed);
355  case 2:
356  if (is_metadata_scan_or_for_import) {
357  switch (column_type.get_type()) {
358  case kBIGINT:
359  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int16_t>(
360  buffer,
361  omnisci_data_type_byte_size,
362  parquet_data_type_byte_size,
363  bit_width,
364  is_signed);
365  case kINT:
366  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int16_t>(
367  buffer,
368  omnisci_data_type_byte_size,
369  parquet_data_type_byte_size,
370  bit_width,
371  is_signed);
372  case kSMALLINT:
373  break;
374  default:
375  UNREACHABLE();
376  }
377  }
378  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int16_t>(
379  buffer,
380  omnisci_data_type_byte_size,
381  parquet_data_type_byte_size,
382  bit_width,
383  is_signed);
384  case 1:
385  if (is_metadata_scan_or_for_import) {
386  switch (column_type.get_type()) {
387  case kBIGINT:
388  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int8_t>(
389  buffer,
390  omnisci_data_type_byte_size,
391  parquet_data_type_byte_size,
392  bit_width,
393  is_signed);
394  case kINT:
395  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int8_t>(
396  buffer,
397  omnisci_data_type_byte_size,
398  parquet_data_type_byte_size,
399  bit_width,
400  is_signed);
401  case kSMALLINT:
402  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int8_t>(
403  buffer,
404  omnisci_data_type_byte_size,
405  parquet_data_type_byte_size,
406  bit_width,
407  is_signed);
408  case kTINYINT:
409  break;
410  default:
411  UNREACHABLE();
412  }
413  }
414  return create_parquet_integral_encoder_with_omnisci_type<int8_t, int8_t>(
415  buffer,
416  omnisci_data_type_byte_size,
417  parquet_data_type_byte_size,
418  bit_width,
419  is_signed);
420  default:
421  UNREACHABLE();
422  }
423  return {};
424 }
#define UNREACHABLE()
Definition: Logger.h:267
#define CHECK(condition)
Definition: Logger.h:223
Definition: sqltypes.h:45
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder_with_omnisci_type ( AbstractBuffer buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const int  bit_width,
const bool  is_signed 
)

Create an integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
bit_width- bit width specified for the Parquet column
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated type V and NullType.

Note: this function determines the appropriate bit depth integral encoder to create, while create_parquet_signed_or_unsigned_integral_encoder_with_types determines whether to create a signed or unsigned integral encoder.

Definition at line 259 of file LazyParquetChunkLoader.cpp.

References create_parquet_signed_or_unsigned_integral_encoder_with_types(), and UNREACHABLE.

264  {
265  switch (bit_width) {
266  case 8:
268  int32_t,
269  uint8_t,
270  NullType>(
271  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
272  case 16:
274  int32_t,
275  uint16_t,
276  NullType>(
277  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
278  case 32:
280  int32_t,
281  uint32_t,
282  NullType>(
283  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
284  case 64:
286  int64_t,
287  uint64_t,
288  NullType>(
289  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
290  default:
291  UNREACHABLE();
292  }
293  return {};
294 }
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types(AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
Create a signed or unsigned integral parquet encoder using types.
#define UNREACHABLE()
Definition: Logger.h:267

+ Here is the call graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_none_type_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 457 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_string(), kBOOLEAN, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

460  {
461  auto column_type = omnisci_column->columnType;
462  if (parquet_column->logical_type()->is_none() &&
463  !omnisci_column->columnType.is_string()) { // boolean
464  if (column_type.get_compression() == kENCODING_NONE) {
465  switch (column_type.get_type()) {
466  case kBOOLEAN:
467  return std::make_shared<ParquetFixedLengthEncoder<int8_t, bool>>(
468  buffer, omnisci_column, parquet_column);
469  default:
470  UNREACHABLE();
471  }
472  } else {
473  UNREACHABLE();
474  }
475  }
476  return {};
477 }
#define UNREACHABLE()
Definition: Logger.h:267
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:510

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename U , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_signed_or_unsigned_integral_encoder_with_types ( AbstractBuffer buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const bool  is_signed 
)

Create a signed or unsigned integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated types V, T, U, and NullType.

Definition at line 224 of file LazyParquetChunkLoader.cpp.

References CHECK.

Referenced by create_parquet_integral_encoder_with_omnisci_type().

228  {
229  CHECK(sizeof(NullType) == omnisci_data_type_byte_size);
230  if (is_signed) {
231  return std::make_shared<ParquetFixedLengthEncoder<V, T, NullType>>(
232  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
233  } else {
234  return std::make_shared<ParquetUnsignedFixedLengthEncoder<V, T, U, NullType>>(
235  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
236  }
237 }
#define CHECK(condition)
Definition: Logger.h:223

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_string_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const Chunk_NS::Chunk chunk,
StringDictionary string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
bool  is_for_import,
const bool  is_for_detect 
)

Definition at line 776 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, Chunk_NS::Chunk::getBuffer(), Chunk_NS::Chunk::getIndexBuf(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

783  {
784  auto column_type = omnisci_column->columnType;
785  if (!is_valid_parquet_string(parquet_column) ||
786  !omnisci_column->columnType.is_string()) {
787  return {};
788  }
789  if (column_type.get_compression() == kENCODING_NONE) {
790  if (is_for_import) {
791  return std::make_shared<ParquetStringImportEncoder>(chunk.getBuffer());
792  } else {
793  return std::make_shared<ParquetStringNoneEncoder>(chunk.getBuffer(),
794  chunk.getIndexBuf());
795  }
796  } else if (column_type.get_compression() == kENCODING_DICT) {
797  if (!is_for_detect) { // non-detect use case
798  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
799  std::unique_ptr<ChunkMetadata>& logical_chunk_metadata = chunk_metadata.back();
800  logical_chunk_metadata->sqlType = omnisci_column->columnType;
801  switch (column_type.get_size()) {
802  case 1:
803  return std::make_shared<ParquetStringEncoder<uint8_t>>(
804  chunk.getBuffer(),
805  string_dictionary,
806  is_for_import ? nullptr : logical_chunk_metadata.get());
807  case 2:
808  return std::make_shared<ParquetStringEncoder<uint16_t>>(
809  chunk.getBuffer(),
810  string_dictionary,
811  is_for_import ? nullptr : logical_chunk_metadata.get());
812  case 4:
813  return std::make_shared<ParquetStringEncoder<int32_t>>(
814  chunk.getBuffer(),
815  string_dictionary,
816  is_for_import ? nullptr : logical_chunk_metadata.get());
817  default:
818  UNREACHABLE();
819  }
820  } else { // detect use-case
821  return std::make_shared<ParquetDetectStringEncoder>(chunk.getBuffer());
822  }
823  } else {
824  UNREACHABLE();
825  }
826  return {};
827 }
AbstractBuffer * getIndexBuf() const
Definition: Chunk.h:148
#define UNREACHABLE()
Definition: Logger.h:267
AbstractBuffer * getBuffer() const
Definition: Chunk.h:146
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:510

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 639 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

643  {
644  auto column_type = omnisci_column->columnType;
645  if (auto time_logical_column = dynamic_cast<const parquet::TimeLogicalType*>(
646  parquet_column->logical_type().get())) {
647  if (column_type.get_compression() == kENCODING_NONE) {
648  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
649  return create_parquet_time_encoder_with_types<int64_t, int32_t, int64_t>(
650  omnisci_column, parquet_column, buffer);
651  } else {
652  return create_parquet_time_encoder_with_types<int64_t, int64_t, int64_t>(
653  omnisci_column, parquet_column, buffer);
654  }
655  } else if (column_type.get_compression() == kENCODING_FIXED) {
656  if (is_metadata_scan_or_for_import) {
657  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
658  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
659  return create_parquet_time_encoder_with_types<int64_t, int32_t, int32_t>(
660  omnisci_column, parquet_column, buffer);
661  } else {
662  CHECK(time_logical_column->time_unit() ==
663  parquet::LogicalType::TimeUnit::MICROS ||
664  time_logical_column->time_unit() ==
665  parquet::LogicalType::TimeUnit::NANOS);
666  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
667  return create_parquet_time_encoder_with_types<int64_t, int64_t, int32_t>(
668  omnisci_column, parquet_column, buffer);
669  }
670  } else {
671  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
672  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
673  return create_parquet_time_encoder_with_types<int32_t, int32_t, int32_t>(
674  omnisci_column, parquet_column, buffer);
675  } else {
676  CHECK(time_logical_column->time_unit() ==
677  parquet::LogicalType::TimeUnit::MICROS ||
678  time_logical_column->time_unit() ==
679  parquet::LogicalType::TimeUnit::NANOS);
680  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
681  return create_parquet_time_encoder_with_types<int32_t, int64_t, int32_t>(
682  omnisci_column, parquet_column, buffer);
683  }
684  }
685  } else {
686  UNREACHABLE();
687  }
688  }
689  return {};
690 }
#define UNREACHABLE()
Definition: Logger.h:267
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder_with_types ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 613 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

616  {
617  if (auto time_logical_type = dynamic_cast<const parquet::TimeLogicalType*>(
618  parquet_column->logical_type().get())) {
619  switch (time_logical_type->time_unit()) {
620  case parquet::LogicalType::TimeUnit::MILLIS:
621  return std::make_shared<ParquetTimeEncoder<V, T, 1000L, NullType>>(
622  buffer, omnisci_column, parquet_column);
623  case parquet::LogicalType::TimeUnit::MICROS:
624  return std::make_shared<ParquetTimeEncoder<V, T, 1000L * 1000L, NullType>>(
625  buffer, omnisci_column, parquet_column);
626  case parquet::LogicalType::TimeUnit::NANOS:
627  return std::make_shared<
628  ParquetTimeEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
629  buffer, omnisci_column, parquet_column);
630  default:
631  UNREACHABLE();
632  }
633  } else {
634  UNREACHABLE();
635  }
636  return {};
637 }
#define UNREACHABLE()
Definition: Logger.h:267
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 554 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_precision(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

558  {
559  auto column_type = omnisci_column->columnType;
560  auto precision = column_type.get_precision();
561  if (parquet_column->logical_type()->is_timestamp()) {
562  if (column_type.get_compression() == kENCODING_NONE) {
563  if (precision == 0) {
564  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int64_t>(
565  omnisci_column, parquet_column, buffer);
566  } else {
567  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
568  buffer, omnisci_column, parquet_column);
569  }
570  } else if (column_type.get_compression() == kENCODING_FIXED) {
571  CHECK(column_type.get_comp_param() == 32);
572  if (is_metadata_scan_or_for_import) {
573  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int32_t>(
574  omnisci_column, parquet_column, buffer);
575  } else {
576  return create_parquet_timestamp_encoder_with_types<int32_t, int64_t, int32_t>(
577  omnisci_column, parquet_column, buffer);
578  }
579  }
580  } else if (parquet_column->logical_type()->is_none() && column_type.is_timestamp()) {
581  if (parquet_column->physical_type() == parquet::Type::INT32) {
582  CHECK(column_type.get_compression() == kENCODING_FIXED &&
583  column_type.get_comp_param() == 32);
584  if (is_metadata_scan_or_for_import) {
585  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int32_t, int32_t>>(
586  buffer, omnisci_column, parquet_column);
587  } else {
588  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t, int32_t>>(
589  buffer, omnisci_column, parquet_column);
590  }
591  } else if (parquet_column->physical_type() == parquet::Type::INT64) {
592  if (column_type.get_compression() == kENCODING_NONE) {
593  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
594  buffer, omnisci_column, parquet_column);
595  } else if (column_type.get_compression() == kENCODING_FIXED) {
596  CHECK(column_type.get_comp_param() == 32);
597  if (is_metadata_scan_or_for_import) {
598  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int32_t>>(
599  buffer, omnisci_column, parquet_column);
600  } else {
601  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int64_t, int32_t>>(
602  buffer, omnisci_column, parquet_column);
603  }
604  }
605  } else {
606  UNREACHABLE();
607  }
608  }
609  return {};
610 }
#define UNREACHABLE()
Definition: Logger.h:267
int get_precision() const
Definition: sqltypes.h:332
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder_with_types ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer buffer 
)

Definition at line 480 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

483  {
484  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
485  parquet_column->logical_type().get())) {
486  switch (timestamp_logical_type->time_unit()) {
487  case parquet::LogicalType::TimeUnit::MILLIS:
488  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L, NullType>>(
489  buffer, omnisci_column, parquet_column);
490  case parquet::LogicalType::TimeUnit::MICROS:
491  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
492  buffer, omnisci_column, parquet_column);
493  case parquet::LogicalType::TimeUnit::NANOS:
494  return std::make_shared<
495  ParquetTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
496  buffer, omnisci_column, parquet_column);
497  default:
498  UNREACHABLE();
499  }
500  } else {
501  UNREACHABLE();
502  }
503  return {};
504 }
#define UNREACHABLE()
Definition: Logger.h:267
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1296 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1296  {
1297  return omnisci_column->columnType.get_dimension() == 6;
1298 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1300 of file LazyParquetChunkLoader.cpp.

1301  {
1302  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MICROS;
1303 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1305 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1305  {
1306  return omnisci_column->columnType.get_dimension() == 3;
1307 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1309 of file LazyParquetChunkLoader.cpp.

1310  {
1311  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS;
1312 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const ColumnDescriptor omnisci_column)

Definition at line 1287 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1287  {
1288  return omnisci_column->columnType.get_dimension() == 9;
1289 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1291 of file LazyParquetChunkLoader.cpp.

1292  {
1293  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::NANOS;
1294 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_list_column ( const parquet::ColumnDescriptor *  parquet_column)

Detect a valid list parquet column.

Parameters
parquet_column- the parquet column descriptor of the column to detect
Returns
true if it is a valid parquet list column

Note: the notion of a valid parquet list column is adapted from the parquet schema specification for logical type definitions:

<list-repetition> group <name> (LIST) { repeated group list { <element-repetition> <element-type> element; } }

Testing has shown that there are small deviations from this specification in at least one library — pyarrow — where the innermost schema node is named "item" as opposed to "element".

The following is also true of the schema definition.

  • The outer-most level must be a group annotated with LIST that contains a single field named list. The repetition of this level must be either optional or required and determines whether the list is nullable.
  • The middle level, named list, must be a repeated group with a single field named element.
  • The element field encodes the list's element type and repetition. Element repetition must be required or optional.

FSI further restricts lists to be defined only at the top level, meaning directly below the root schema node.

Definition at line 100 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_array_encoder(), foreign_storage::LazyParquetChunkLoader::suggestColumnMapping(), validate_array_mapping(), validate_column_mapping_and_row_group_metadata(), validate_definition_levels(), and validate_max_repetition_and_definition_level().

100  {
101  const parquet::schema::Node* node = parquet_column->schema_node().get();
102  if ((node->name() != "element" && node->name() != "item") ||
103  !(node->is_required() ||
104  node->is_optional())) { // ensure first innermost node is named "element"
105  // which is required by the parquet specification;
106  // however testing shows that pyarrow generates this
107  // column with the name of "item"
108  // this field must be either required or optional
109  return false;
110  }
111  node = node->parent();
112  if (!node) { // required nested structure
113  return false;
114  }
115  if (node->name() != "list" || !node->is_repeated() ||
116  !node->is_group()) { // ensure second innermost node is named "list" which is
117  // a repeated group; this is
118  // required by the parquet specification
119  return false;
120  }
121  node = node->parent();
122  if (!node) { // required nested structure
123  return false;
124  }
125  if (!node->logical_type()->is_list() ||
126  !(node->is_optional() ||
127  node->is_required())) { // ensure third outermost node has logical type LIST
128  // which is either optional or required; this is required
129  // by the parquet specification
130  return false;
131  }
132  node =
133  node->parent(); // this must now be the root node of schema which is required by
134  // FSI (lists can not be embedded into a deeper nested structure)
135  if (!node) { // required nested structure
136  return false;
137  }
138  node = node->parent();
139  if (node) { // implies the previous node was not the root node
140  return false;
141  }
142  return true;
143 }

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_string ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 58 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_geospatial_encoder(), create_parquet_string_encoder(), suggest_column_scalar_type(), suggest_string_mapping(), validate_geospatial_mapping(), and validate_string_mapping().

58  {
59  return (parquet_column->logical_type()->is_none() &&
60  parquet_column->physical_type() == parquet::Type::BYTE_ARRAY) ||
61  parquet_column->logical_type()->is_string();
62 }

+ Here is the caller graph for this function:

std::list<RowGroupMetadata> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval ( const std::map< int, std::shared_ptr< ParquetEncoder >> &  encoder_map,
const RowGroupInterval &  row_group_interval,
const ReaderPtr &  reader,
const ForeignTableSchema &  schema 
)

Definition at line 1674 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnId, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::ForeignTableSchema::getLogicalColumn(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::RowGroupInterval::start_index.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1678  {
1679  std::list<RowGroupMetadata> row_group_metadata;
1680  auto column_interval =
1681  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
1682  schema.getLogicalAndPhysicalColumns().back()->columnId};
1683 
1684  auto file_metadata = reader->parquet_reader()->metadata();
1685  for (int row_group = row_group_interval.start_index;
1686  row_group <= row_group_interval.end_index;
1687  ++row_group) {
1688  auto& row_group_metadata_item = row_group_metadata.emplace_back();
1689  row_group_metadata_item.row_group_index = row_group;
1690  row_group_metadata_item.file_path = row_group_interval.file_path;
1691 
1692  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1693  file_metadata->RowGroup(row_group);
1694 
1695  for (int column_id = column_interval.start; column_id <= column_interval.end;
1696  column_id++) {
1697  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1698  auto parquet_column_index = schema.getParquetColumnIndex(column_id);
1699  auto encoder_map_iter =
1700  encoder_map.find(schema.getLogicalColumn(column_id)->columnId);
1701  CHECK(encoder_map_iter != encoder_map.end());
1702  try {
1703  auto metadata = encoder_map_iter->second->getRowGroupMetadata(
1704  group_metadata.get(), parquet_column_index, column_descriptor->columnType);
1705  row_group_metadata_item.column_chunk_metadata.emplace_back(metadata);
1706  } catch (const std::exception& e) {
1707  std::stringstream error_message;
1708  error_message << e.what() << " in row group " << row_group << " of Parquet file '"
1709  << row_group_interval.file_path << "'.";
1710  throw std::runtime_error(error_message.str());
1711  }
1712  }
1713  }
1714  return row_group_metadata;
1715 }
#define CHECK(condition)
Definition: Logger.h:223

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_import ( const std::map< int, Chunk_NS::Chunk chunks,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const std::map< int, StringDictionary * >  column_dictionaries,
const int64_t  num_rows,
const RenderGroupAnalyzerMap render_group_analyzer_map 
)

Definition at line 1717 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_import(), shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), and foreign_storage::ForeignTableSchema::getParquetColumnIndex().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1723  {
1724  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1725  auto file_metadata = reader->parquet_reader()->metadata();
1726  for (auto& [column_id, chunk] : chunks) {
1727  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1728  if (column_descriptor->isGeoPhyCol) { // skip physical columns
1729  continue;
1730  }
1731  auto parquet_column_descriptor =
1732  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1733  auto find_it = column_dictionaries.find(column_id);
1734  StringDictionary* dictionary =
1735  (find_it == column_dictionaries.end() ? nullptr : find_it->second);
1736  std::list<Chunk_NS::Chunk> chunks_for_import;
1737  chunks_for_import.push_back(chunk);
1738  if (column_descriptor->columnType.is_geometry()) {
1739  for (int i = 0; i < column_descriptor->columnType.get_physical_cols(); ++i) {
1740  chunks_for_import.push_back(chunks.at(column_id + i + 1));
1741  }
1742  }
1743  encoder_map[column_id] = create_parquet_encoder_for_import(chunks_for_import,
1744  column_descriptor,
1745  parquet_column_descriptor,
1746  dictionary,
1747  render_group_analyzer_map);
1748 
1749  // reserve space in buffer when num-elements known ahead of time for types
1750  // of known size (for example dictionary encoded strings)
1751  auto encoder = shared::get_from_map(encoder_map, column_id);
1752  if (auto inplace_encoder = dynamic_cast<ParquetInPlaceEncoder*>(encoder.get())) {
1753  inplace_encoder->reserve(num_rows);
1754  }
1755  }
1756  return encoder_map;
1757 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import(std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const RenderGroupAnalyzerMap *render_group_analyzer_map)
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:62

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_metadata_scan ( const Interval< ColumnType > &  column_interval,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const RenderGroupAnalyzerMap render_group_analyzer_map,
const bool  do_metadata_stats_validation 
)

Definition at line 1759 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_metadata_scan(), foreign_storage::Interval< T >::end, shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::Interval< T >::start.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1764  {
1765  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1766  auto file_metadata = reader->parquet_reader()->metadata();
1767  for (int column_id = column_interval.start; column_id <= column_interval.end;
1768  column_id++) {
1769  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1770  auto parquet_column_descriptor =
1771  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1772  encoder_map[column_id] = create_parquet_encoder_for_metadata_scan(
1773  column_descriptor, parquet_column_descriptor, render_group_analyzer_map);
1774  if (!do_metadata_stats_validation) {
1775  shared::get_from_map(encoder_map, column_id)->disableMetadataStatsValidation();
1776  }
1777  column_id += column_descriptor->columnType.get_physical_cols();
1778  }
1779  return encoder_map;
1780 }
T const end
Definition: Intervals.h:68
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:62
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const RenderGroupAnalyzerMap *render_group_analyzer_map)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::vector< int8_t > &  values 
)

Definition at line 1135 of file LazyParquetChunkLoader.cpp.

References foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements, ColumnDescriptor::columnType, and SQLTypeInfo::get_size().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1137  {
1138  auto max_type_byte_size =
1139  std::max(omnisci_column->columnType.get_size(),
1140  parquet::GetTypeByteSize(parquet_column->physical_type()));
1141  size_t values_size =
1142  LazyParquetChunkLoader::batch_reader_num_elements * max_type_byte_size;
1143  values.resize(values_size);
1144 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_boolean_type_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1323 of file LazyParquetChunkLoader.cpp.

References kBOOLEAN, kENCODING_NONE, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1324  {
1325  SQLTypeInfo type;
1327  type.set_type(kBOOLEAN);
1328  type.set_fixed_size();
1329  return type;
1330 }
void set_compression(EncodingType c)
Definition: sqltypes.h:440
void set_fixed_size()
Definition: sqltypes.h:438
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:429

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_column_scalar_type ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1528 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_string(), suggest_boolean_type_mapping(), suggest_date_mapping(), suggest_decimal_mapping(), suggest_floating_point_mapping(), suggest_integral_mapping(), suggest_string_mapping(), suggest_time_mapping(), and suggest_timestamp_mapping().

Referenced by foreign_storage::LazyParquetChunkLoader::suggestColumnMapping().

1528  {
1529  // decimal case
1530  if (parquet_column->logical_type()->is_decimal()) {
1531  return suggest_decimal_mapping(parquet_column);
1532  }
1533  // float case
1534  if (parquet_column->logical_type()->is_none() &&
1535  (parquet_column->physical_type() == parquet::Type::FLOAT ||
1536  parquet_column->physical_type() == parquet::Type::DOUBLE)) {
1537  return suggest_floating_point_mapping(parquet_column);
1538  }
1539  // integral case
1540  if ((parquet_column->logical_type()->is_none() &&
1541  (parquet_column->physical_type() == parquet::Type::INT32 ||
1542  parquet_column->physical_type() == parquet::Type::INT64)) ||
1543  parquet_column->logical_type()->is_int()) {
1544  return suggest_integral_mapping(parquet_column);
1545  }
1546  // boolean case
1547  if (parquet_column->logical_type()->is_none() &&
1548  parquet_column->physical_type() == parquet::Type::BOOLEAN) {
1549  return suggest_boolean_type_mapping(parquet_column);
1550  }
1551  // timestamp case
1552  if (parquet_column->logical_type()->is_timestamp()) {
1553  return suggest_timestamp_mapping(parquet_column);
1554  }
1555  // time case
1556  if (parquet_column->logical_type()->is_time()) {
1557  return suggest_time_mapping(parquet_column);
1558  }
1559  // date case
1560  if (parquet_column->logical_type()->is_date()) {
1561  return suggest_date_mapping(parquet_column);
1562  }
1563  // string case
1564  if (is_valid_parquet_string(parquet_column)) {
1565  return suggest_string_mapping(parquet_column);
1566  }
1567 
1568  throw ForeignStorageException("Unsupported data type detected for column: " +
1569  parquet_column->ToString());
1570 }
SQLTypeInfo suggest_decimal_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_timestamp_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_string_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_date_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_floating_point_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_integral_mapping(const parquet::ColumnDescriptor *parquet_column)
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_boolean_type_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_time_mapping(const parquet::ColumnDescriptor *parquet_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_date_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1427 of file LazyParquetChunkLoader.cpp.

References CHECK, kDATE, kENCODING_NONE, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1427  {
1428  CHECK(parquet_column->logical_type()->is_date());
1429  SQLTypeInfo type;
1430  type.set_type(kDATE);
1431  type.set_compression(kENCODING_NONE);
1432  type.set_fixed_size();
1433  return type;
1434 }
Definition: sqltypes.h:53
#define CHECK(condition)
Definition: Logger.h:223

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_decimal_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1160 of file LazyParquetChunkLoader.cpp.

References kDECIMAL, kENCODING_NONE, SQLTypeInfo::scale, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_scale(), SQLTypeInfo::set_type(), to_string(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1160  {
1161  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1162  parquet_column->logical_type().get())) {
1163  auto parquet_precision = decimal_logical_column->precision();
1164  auto parquet_scale = decimal_logical_column->scale();
1165  if (parquet_precision > 18) {
1166  throw ForeignStorageException(
1167  "Parquet column \"" + parquet_column->ToString() +
1168  "\" has decimal precision of " + std::to_string(parquet_precision) +
1169  " which is too high to import, maximum precision supported is 18.");
1170  }
1171  SQLTypeInfo type;
1172  type.set_type(kDECIMAL);
1174  type.set_precision(parquet_precision);
1175  type.set_scale(parquet_scale);
1176  type.set_fixed_size();
1177  return type;
1178  }
1179  UNREACHABLE()
1180  << " a Parquet column's decimal logical type failed to be read appropriately";
1181  return {};
1182 }
void set_compression(EncodingType c)
Definition: sqltypes.h:440
#define UNREACHABLE()
Definition: Logger.h:267
std::string to_string(char const *&&v)
void set_fixed_size()
Definition: sqltypes.h:438
void set_scale(int s)
Definition: sqltypes.h:434
void set_precision(int d)
Definition: sqltypes.h:432
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:429

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_floating_point_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1200 of file LazyParquetChunkLoader.cpp.

References kDOUBLE, kENCODING_NONE, kFLOAT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1201  {
1202  SQLTypeInfo type;
1203  if (parquet_column->physical_type() == parquet::Type::FLOAT) {
1204  type.set_type(kFLOAT);
1205  } else if (parquet_column->physical_type() == parquet::Type::DOUBLE) {
1206  type.set_type(kDOUBLE);
1207  } else {
1208  UNREACHABLE();
1209  }
1211  type.set_fixed_size();
1212  return type;
1213 }
void set_compression(EncodingType c)
Definition: sqltypes.h:440
#define UNREACHABLE()
Definition: Logger.h:267
void set_fixed_size()
Definition: sqltypes.h:438
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:429

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_integral_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1242 of file LazyParquetChunkLoader.cpp.

References CHECK, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and within_range().

Referenced by suggest_column_scalar_type().

1242  {
1243  SQLTypeInfo type;
1245  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1246  parquet_column->logical_type().get())) {
1247  auto bit_width = int_logical_column->bit_width();
1248  if (!int_logical_column->is_signed()) {
1249  if (within_range(33, 64, bit_width)) {
1250  throw ForeignStorageException(
1251  "Unsigned integer column \"" + parquet_column->name() +
1252  "\" in Parquet file with 64 bit-width has no supported type for ingestion "
1253  "that will not result in data loss");
1254  } else if (within_range(17, 32, bit_width)) {
1255  type.set_type(kBIGINT);
1256  } else if (within_range(9, 16, bit_width)) {
1257  type.set_type(kINT);
1258  } else if (within_range(0, 8, bit_width)) {
1259  type.set_type(kSMALLINT);
1260  }
1261  } else {
1262  if (within_range(33, 64, bit_width)) {
1263  type.set_type(kBIGINT);
1264  } else if (within_range(17, 32, bit_width)) {
1265  type.set_type(kINT);
1266  } else if (within_range(9, 16, bit_width)) {
1267  type.set_type(kSMALLINT);
1268  } else if (within_range(0, 8, bit_width)) {
1269  type.set_type(kTINYINT);
1270  }
1271  }
1272  type.set_fixed_size();
1273  return type;
1274  }
1275 
1276  CHECK(parquet_column->logical_type()->is_none());
1277  if (parquet_column->physical_type() == parquet::Type::INT32) {
1278  type.set_type(kINT);
1279  } else {
1280  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
1281  type.set_type(kBIGINT);
1282  }
1283  type.set_fixed_size();
1284  return type;
1285 }
void set_compression(EncodingType c)
Definition: sqltypes.h:440
void set_fixed_size()
Definition: sqltypes.h:438
#define CHECK(condition)
Definition: Logger.h:223
Definition: sqltypes.h:45
bool within_range(int64_t lower_bound, int64_t upper_bound, int64_t value)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:429

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_string_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1444 of file LazyParquetChunkLoader.cpp.

References CHECK, is_valid_parquet_string(), kENCODING_DICT, kTEXT, SQLTypeInfo::set_comp_param(), SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1444  {
1445  CHECK(is_valid_parquet_string(parquet_column));
1446  SQLTypeInfo type;
1447  type.set_type(kTEXT);
1449  type.set_comp_param(32);
1450  type.set_fixed_size();
1451  return type;
1452 }
void set_compression(EncodingType c)
Definition: sqltypes.h:440
void set_fixed_size()
Definition: sqltypes.h:438
void set_comp_param(int p)
Definition: sqltypes.h:441
Definition: sqltypes.h:52
#define CHECK(condition)
Definition: Logger.h:223
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:429

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_time_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1401 of file LazyParquetChunkLoader.cpp.

References CHECK, kENCODING_NONE, kTIME, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1401  {
1402  CHECK(parquet_column->logical_type()->is_time());
1403  SQLTypeInfo type;
1404  type.set_type(kTIME);
1405  type.set_compression(kENCODING_NONE);
1406  type.set_fixed_size();
1407  return type;
1408 }
Definition: sqltypes.h:49
#define CHECK(condition)
Definition: Logger.h:223

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_timestamp_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1367 of file LazyParquetChunkLoader.cpp.

References is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_NONE, kTIMESTAMP, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1367  {
1368  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1369  parquet_column->logical_type().get())) {
1370  SQLTypeInfo type;
1371  type.set_type(kTIMESTAMP);
1373  if (is_nanosecond_precision(timestamp_logical_column)) {
1374  type.set_precision(9);
1375  } else if (is_microsecond_precision(timestamp_logical_column)) {
1376  type.set_precision(6);
1377  } else if (is_millisecond_precision(timestamp_logical_column)) {
1378  type.set_precision(3);
1379  }
1380  type.set_fixed_size();
1381  return type;
1382  }
1383  UNREACHABLE();
1384  return {};
1385 }
void set_compression(EncodingType c)
Definition: sqltypes.h:440
#define UNREACHABLE()
Definition: Logger.h:267
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
void set_fixed_size()
Definition: sqltypes.h:438
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
void set_precision(int d)
Definition: sqltypes.h:432
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:429

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_missing_metadata_error ( const int  row_group_index,
const int  column_index,
const std::string &  file_path 
)

Definition at line 1582 of file LazyParquetChunkLoader.cpp.

References to_string().

Referenced by validate_column_mapping_and_row_group_metadata().

// Raise a runtime error identifying the row group, column, and file that are
// missing required statistics metadata.
void throw_missing_metadata_error(const int row_group_index,
                                  const int column_index,
                                  const std::string& file_path) {
  std::stringstream error_message;
  error_message << "Statistics metadata is required for all row groups. Metadata is "
                   "missing for row group index: "
                << row_group_index << ", column index: " << column_index
                << ", file path: " << file_path;
  throw std::runtime_error{error_message.str()};
}
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_row_group_larger_than_fragment_size_error ( const MaxRowGroupSizeStats  max_row_group_stats,
const int  fragment_size 
)

Definition at line 1598 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::file_path, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_index, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_size, and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1600  {
1601  auto metadata_scan_exception = MetadataScanInfeasibleFragmentSizeException{
1602  "Parquet file has a row group size that is larger than the fragment size. "
1603  "Please set the table fragment size to a number that is larger than the "
1604  "row group size. Row group index: " +
1605  std::to_string(max_row_group_stats.max_row_group_index) +
1606  ", row group size: " + std::to_string(max_row_group_stats.max_row_group_size) +
1607  ", fragment size: " + std::to_string(fragment_size) +
1608  ", file path: " + max_row_group_stats.file_path};
1609  metadata_scan_exception.min_feasible_fragment_size_ =
1610  max_row_group_stats.max_row_group_size;
1611  throw metadata_scan_exception;
1612 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping ( const parquet::ColumnDescriptor *  parquet_column,
const ColumnDescriptor omnisci_column 
)

Definition at line 1497 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_type_name(), foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported(), LOG, run_benchmark_import::type, and logger::WARNING.

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_column_mapping_and_row_group_metadata().

1498  {
1499  parquet::Type::type physical_type = parquet_column->physical_type();
1500  auto logical_type = parquet_column->logical_type();
1501  bool allowed_type =
1502  LazyParquetChunkLoader::isColumnMappingSupported(omnisci_column, parquet_column);
1503  if (!allowed_type) {
1504  if (logical_type->is_timestamp()) {
1505  auto timestamp_type =
1506  dynamic_cast<const parquet::TimestampLogicalType*>(logical_type.get());
1507  CHECK(timestamp_type);
1508 
1509  if (!timestamp_type->is_adjusted_to_utc()) {
1510  LOG(WARNING) << "Non-UTC timezone specified in Parquet file for column \""
1511  << omnisci_column->columnName
1512  << "\". Only UTC timezone is currently supported.";
1513  }
1514  }
1515  std::string parquet_type;
1516  if (parquet_column->logical_type()->is_none()) {
1517  parquet_type = parquet::TypeToString(physical_type);
1518  } else {
1519  parquet_type = logical_type->ToString();
1520  }
1521  std::string omnisci_type = omnisci_column->columnType.get_type_name();
1522  throw std::runtime_error{"Conversion from Parquet type \"" + parquet_type +
1523  "\" to HeavyDB type \"" + omnisci_type +
1524  "\" is not allowed. Please use an appropriate column type."};
1525  }
1526 }
#define LOG(tag)
Definition: Logger.h:217
std::string get_type_name() const
Definition: sqltypes.h:443
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1454 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1455  {
1456  if (is_valid_parquet_list_column(parquet_column) &&
1457  omnisci_column->columnType.is_array()) {
1458  auto omnisci_column_sub_type_column = get_sub_type_column_descriptor(omnisci_column);
1459  return LazyParquetChunkLoader::isColumnMappingSupported(
1460  omnisci_column_sub_type_column.get(), parquet_column);
1461  }
1462  return false;
1463 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
SQLTypeInfo columnType
bool is_array() const
Definition: sqltypes.h:518

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_column_mapping_and_row_group_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1614 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::getLogicalColumns(), is_valid_parquet_list_column(), throw_missing_metadata_error(), and validate_allowed_mapping().

Referenced by validate_parquet_metadata().

// Validates every parquet column's mapping to its HeavyDB logical column and
// checks that each non-empty row group carries usable statistics metadata;
// returns stats describing the largest row group encountered.
1617  {
// Pairs parquet column index i with the schema's logical columns in order —
// assumes the two sequences line up one-to-one (TODO confirm for geo columns).
1618  auto column_it = schema.getLogicalColumns().begin();
1619  MaxRowGroupSizeStats max_row_group_stats{0, 0};
1620  for (int i = 0; i < file_metadata->num_columns(); ++i, ++column_it) {
1621  const parquet::ColumnDescriptor* descr = file_metadata->schema()->Column(i);
1622  try {
1623  validate_allowed_mapping(descr, *column_it);
1624  } catch (std::runtime_error& e) {
// Re-throw with column names and file path added for diagnostics.
1625  std::stringstream error_message;
1626  error_message << e.what() << " Parquet column: " << descr->name()
1627  << ", HeavyDB column: " << (*column_it)->columnName
1628  << ", Parquet file: " << file_path << ".";
1629  throw std::runtime_error(error_message.str());
1630  }
1631 
1632  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
1633  auto group_metadata = file_metadata->RowGroup(r);
1634  auto num_rows = group_metadata->num_rows();
// Empty row groups are exempt from the statistics requirement.
1635  if (num_rows == 0) {
1636  continue;
// Track the largest row group seen so the caller can compare it against
// the table's fragment size.
1637  } else if (num_rows > max_row_group_stats.max_row_group_size) {
1638  max_row_group_stats.max_row_group_size = num_rows;
1639  max_row_group_stats.max_row_group_index = r;
1640  max_row_group_stats.file_path = file_path;
1641  }
1642 
1643  auto column_chunk = group_metadata->ColumnChunk(i);
1644  bool contains_metadata = column_chunk->is_stats_set();
1645  if (contains_metadata) {
1646  auto stats = column_chunk->statistics();
1647  bool is_all_nulls = stats->null_count() == column_chunk->num_values();
1648  bool is_list = is_valid_parquet_list_column(file_metadata->schema()->Column(i));
1649  // Given a list, it is possible it has no min or max if it is comprised
1650  // only of empty lists & nulls. This can not be detected by comparing
1651  // the null count; therefore we afford list types the benefit of the
1652  // doubt in this situation.
1653  if (!(stats->HasMinMax() || is_all_nulls || is_list)) {
1654  contains_metadata = false;
1655  }
1656  }
1657 
// Statistics are mandatory for every non-empty, non-exempt column chunk.
1658  if (!contains_metadata) {
1659  throw_missing_metadata_error(r, i, file_path);
1660  }
1661  }
1662  }
1663  return max_row_group_stats;
1664 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
void throw_missing_metadata_error(const int row_group_index, const int column_index, const std::string &file_path)
void validate_allowed_mapping(const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1410 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kDATE, kENCODING_DATE_IN_DAYS, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1411  {
1412  if (!(omnisci_column->columnType.get_type() == kDATE &&
1413  ((omnisci_column->columnType.get_compression() == kENCODING_DATE_IN_DAYS &&
1414  (omnisci_column->columnType.get_comp_param() ==
1415  0 // DATE ENCODING DAYS (32) specifies comp_param of 0
1416  || omnisci_column->columnType.get_comp_param() == 16)) ||
1417  omnisci_column->columnType.get_compression() ==
1418  kENCODING_NONE // for array types
1419  ))) {
1420  return false;
1421  }
1422  return parquet_column->logical_type()->is_date() ||
1423  parquet_column->logical_type()
1424  ->is_timestamp(); // to support TIMESTAMP -> DATE coercion
1425 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
Definition: sqltypes.h:53
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1146 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_precision(), SQLTypeInfo::get_scale(), SQLTypeInfo::is_decimal(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1147  {
1148  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1149  parquet_column->logical_type().get())) {
1150  return omnisci_column->columnType.get_precision() ==
1151  decimal_logical_column->precision() &&
1152  omnisci_column->columnType.get_scale() == decimal_logical_column->scale() &&
1153  omnisci_column->columnType.is_decimal() &&
1154  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1155  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1156  }
1157  return false;
1158 }
HOST DEVICE int get_scale() const
Definition: sqltypes.h:334
int get_precision() const
Definition: sqltypes.h:332
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
SQLTypeInfo columnType
bool is_decimal() const
Definition: sqltypes.h:513

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_definition_levels ( const parquet::ParquetFileReader *  reader,
const int  row_group_index,
const int  column_index,
const int16_t *  def_levels,
const int64_t  num_levels,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1069 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_list_column(), and foreign_storage::validate_and_get_column_metadata_statistics().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1075  {
1076  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1077  if (!is_valid_parquet_list) {
1078  return;
1079  }
1080  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1081  reader->metadata()->RowGroup(row_group_index);
1082  auto column_metadata = group_metadata->ColumnChunk(column_index);
1083  auto stats = validate_and_get_column_metadata_statistics(column_metadata.get());
1084  if (!stats->HasMinMax()) {
1085  auto find_it = std::find_if(def_levels,
1086  def_levels + num_levels,
1087  [](const int16_t def_level) { return def_level == 3; });
1088  if (find_it != def_levels + num_levels) {
1089  throw std::runtime_error(
1090  "No minimum and maximum statistic set in list column but non-null & non-empty "
1091  "array/value detected.");
1092  }
1093  }
1094 }
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema ( const parquet::arrow::FileReader *  reference_file_reader,
const parquet::arrow::FileReader *  new_file_reader,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 1471 of file LazyParquetChunkLoader.cpp.

References foreign_storage::get_column_descriptor(), to_string(), and foreign_storage::validate_equal_column_descriptor().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan(), and foreign_storage::LazyParquetChunkLoader::previewFiles().

1474  {
1475  const auto reference_num_columns =
1476  reference_file_reader->parquet_reader()->metadata()->num_columns();
1477  const auto new_num_columns =
1478  new_file_reader->parquet_reader()->metadata()->num_columns();
1479  if (reference_num_columns != new_num_columns) {
1480  throw std::runtime_error{"Parquet file \"" + new_file_path +
1481  "\" has a different schema. Please ensure that all Parquet "
1482  "files use the same schema. Reference Parquet file: \"" +
1483  reference_file_path + "\" has " +
1484  std::to_string(reference_num_columns) +
1485  " columns. New Parquet file \"" + new_file_path + "\" has " +
1486  std::to_string(new_num_columns) + " columns."};
1487  }
1488 
1489  for (int i = 0; i < reference_num_columns; i++) {
1490  validate_equal_column_descriptor(get_column_descriptor(reference_file_reader, i),
1491  get_column_descriptor(new_file_reader, i),
1492  reference_file_path,
1493  new_file_path);
1494  }
1495 }
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
std::string to_string(char const *&&v)
const ColumnDescriptor * get_column_descriptor(const int col_id, const int table_id, const Catalog_Namespace::Catalog &cat)
Definition: Execute.h:191

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1184 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), SQLTypeInfo::is_fp(), kENCODING_NONE, and kFLOAT.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1185  {
1186  if (!omnisci_column->columnType.is_fp()) {
1187  return false;
1188  }
1189  // check if mapping is a valid coerced or non-coerced floating point mapping
1190  // with no annotation (floating point columns have no annotation in the
1191  // Parquet specification)
1192  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1193  return (parquet_column->physical_type() == parquet::Type::DOUBLE) ||
1194  (parquet_column->physical_type() == parquet::Type::FLOAT &&
1195  omnisci_column->columnType.get_type() == kFLOAT);
1196  }
1197  return false;
1198 }
bool is_fp() const
Definition: sqltypes.h:514
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1465 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_geometry(), and is_valid_parquet_string().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1466  {
1467  return is_valid_parquet_string(parquet_column) &&
1468  omnisci_column->columnType.is_geometry();
1469 }
bool is_geometry() const
Definition: sqltypes.h:522
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1215 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_size(), SQLTypeInfo::is_integer(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1216  {
1217  if (!omnisci_column->columnType.is_integer()) {
1218  return false;
1219  }
1220  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1221  parquet_column->logical_type().get())) {
1222  CHECK(omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1223  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1224  const int bits_per_byte = 8;
1225  // unsigned types are permitted to map to a wider integral type in order to avoid
1226  // precision loss
1227  const int bit_widening_factor = int_logical_column->is_signed() ? 1 : 2;
1228  return omnisci_column->columnType.get_size() * bits_per_byte <=
1229  int_logical_column->bit_width() * bit_widening_factor;
1230  }
1231  // check if mapping is a valid coerced or non-coerced integral mapping with no
1232  // annotation
1233  if ((omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1234  omnisci_column->columnType.get_compression() == kENCODING_FIXED)) {
1235  return (parquet_column->physical_type() == parquet::Type::INT64) ||
1236  (parquet_column->physical_type() == parquet::Type::INT32 &&
1237  omnisci_column->columnType.get_size() <= 4);
1238  }
1239  return false;
1240 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
bool is_integer() const
Definition: sqltypes.h:512
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
#define CHECK(condition)
Definition: Logger.h:223
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level ( const ColumnDescriptor omnisci_column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1096 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1098  {
1099  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1100  if (is_valid_parquet_list && !omnisci_column_descriptor->columnType.is_array()) {
1101  throw std::runtime_error(
1102  "Unsupported mapping detected. Column '" + parquet_column_descriptor->name() +
1103  "' detected to be a parquet list but HeavyDB mapped column '" +
1104  omnisci_column_descriptor->columnName + "' is not an array.");
1105  }
1106  if (is_valid_parquet_list) {
1107  if (parquet_column_descriptor->max_repetition_level() != 1 ||
1108  parquet_column_descriptor->max_definition_level() != 3) {
1109  throw std::runtime_error(
1110  "Incorrect schema max repetition level detected in column '" +
1111  parquet_column_descriptor->name() +
1112  "'. Expected a max repetition level of 1 and max definition level of 3 for "
1113  "list column but column has a max "
1114  "repetition level of " +
1115  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1116  " and a max definition level of " +
1117  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1118  }
1119  } else {
1120  if (parquet_column_descriptor->max_repetition_level() != 0 ||
1121  parquet_column_descriptor->max_definition_level() != 1) {
1122  throw std::runtime_error(
1123  "Incorrect schema max repetition level detected in column '" +
1124  parquet_column_descriptor->name() +
1125  "'. Expected a max repetition level of 0 and max definition level of 1 for "
1126  "flat column but column has a max "
1127  "repetition level of " +
1128  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1129  " and a max definition level of " +
1130  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1131  }
1132  }
1133 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::string to_string(char const *&&v)
SQLTypeInfo columnType
std::string columnName
bool is_array() const
Definition: sqltypes.h:518

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1314 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kBOOLEAN, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1315  {
1316  bool is_none_encoded_mapping =
1317  omnisci_column->columnType.get_compression() == kENCODING_NONE &&
1318  (parquet_column->physical_type() == parquet::Type::BOOLEAN &&
1319  omnisci_column->columnType.get_type() == kBOOLEAN);
1320  return parquet_column->logical_type()->is_none() && is_none_encoded_mapping;
1321 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1572 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::numLogicalColumns(), and foreign_storage::throw_number_of_columns_mismatch_error().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_parquet_metadata().

1575  {
1576  if (schema.numLogicalColumns() != file_metadata->num_columns()) {
1578  schema.numLogicalColumns(), file_metadata->num_columns(), file_path);
1579  }
1580 }
void throw_number_of_columns_mismatch_error(size_t num_table_cols, size_t num_file_cols, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1666 of file LazyParquetChunkLoader.cpp.

References validate_column_mapping_and_row_group_metadata(), and validate_number_of_columns().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1669  {
1670  validate_number_of_columns(file_metadata, file_path, schema);
1671  return validate_column_mapping_and_row_group_metadata(file_metadata, file_path, schema);
1672 }
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
void validate_number_of_columns(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1436 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1437  {
1438  return is_valid_parquet_string(parquet_column) &&
1439  omnisci_column->columnType.is_string() &&
1440  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1441  omnisci_column->columnType.get_compression() == kENCODING_DICT);
1442 }
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:510

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1387 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kENCODING_FIXED, kENCODING_NONE, and kTIME.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1388  {
1389  if (!(omnisci_column->columnType.get_type() == kTIME &&
1390  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1391  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1392  omnisci_column->columnType.get_comp_param() == 32)))) {
1393  return false;
1394  }
1395  if (parquet_column->logical_type()->is_time()) {
1396  return true;
1397  }
1398  return false;
1399 }
Definition: sqltypes.h:49
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping ( const ColumnDescriptor omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1332 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_dimension(), SQLTypeInfo::get_type(), is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_FIXED, kENCODING_NONE, and kTIMESTAMP.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1333  {
1334  if (!(omnisci_column->columnType.get_type() == kTIMESTAMP &&
1335  ((omnisci_column->columnType.get_compression() == kENCODING_NONE) ||
1336  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1337  omnisci_column->columnType.get_comp_param() == 32)))) {
1338  return false;
1339  }
1340  // check the annotated case
1341  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1342  parquet_column->logical_type().get())) {
1343  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1344  return omnisci_column->columnType.get_dimension() == 0 ||
1345  ((is_nanosecond_precision(omnisci_column) &&
1346  is_nanosecond_precision(timestamp_logical_column)) ||
1347  (is_microsecond_precision(omnisci_column) &&
1348  is_microsecond_precision(timestamp_logical_column)) ||
1349  (is_millisecond_precision(omnisci_column) &&
1350  is_millisecond_precision(timestamp_logical_column)));
1351  }
1352  if (omnisci_column->columnType.get_compression() == kENCODING_FIXED) {
1353  return omnisci_column->columnType.get_dimension() == 0;
1354  }
1355  }
1356  // check the unannotated case
1357  if (parquet_column->logical_type()->is_none() &&
1358  ((parquet_column->physical_type() == parquet::Type::INT32 &&
1359  omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1360  omnisci_column->columnType.get_comp_param() == 32) ||
1361  parquet_column->physical_type() == parquet::Type::INT64)) {
1362  return true;
1363  }
1364  return false;
1365 }
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:331
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::within_range ( int64_t  lower_bound,
int64_t  upper_bound,
int64_t  value 
)

Definition at line 54 of file LazyParquetChunkLoader.cpp.

References gpu_enabled::upper_bound().

Referenced by suggest_integral_mapping().

// Returns true iff value lies within the closed interval
// [lower_bound, upper_bound].
bool within_range(int64_t lower_bound, int64_t upper_bound, int64_t value) {
  return lower_bound <= value && value <= upper_bound;
}
DEVICE auto upper_bound(ARGS &&...args)
Definition: gpu_enabled.h:123
DEVICE auto lower_bound(ARGS &&...args)
Definition: gpu_enabled.h:78

+ Here is the call graph for this function:

+ Here is the caller graph for this function: