OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp} Namespace Reference

Classes

struct  MaxRowGroupSizeStats
 

Functions

bool within_range (int64_t lower_bound, int64_t upper_bound, int64_t value)
 
bool is_valid_parquet_string (const parquet::ColumnDescriptor *parquet_column)
 
bool is_valid_parquet_list_column (const parquet::ColumnDescriptor *parquet_column)
 Detect a valid list parquet column. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder_with_omnisci_type (const ColumnDescriptor *column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename U , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
 Create a signed or unsigned integral parquet encoder using types. More...
 
template<typename V , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder_with_omnisci_type (AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const int bit_width, const bool is_signed)
 Create an integral parquet encoder using types. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
template<typename V , typename T , typename NullType >
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder_with_types (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
 
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
 
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan, const bool is_for_import)
 
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
 Create a Parquet specific encoder for a Parquet to OmniSci mapping. More...
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import (std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
void validate_definition_levels (const parquet::ParquetFileReader *reader, const int row_group_index, const int column_index, const int16_t *def_levels, const int64_t num_levels, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void validate_max_repetition_and_definition_level (const ColumnDescriptor *omnisci_column_descriptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
void resize_values_buffer (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::vector< int8_t > &values)
 
bool validate_decimal_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_decimal_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_floating_point_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_floating_point_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_integral_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_integral_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool is_nanosecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_nanosecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_microsecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_microsecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool is_millisecond_precision (const ColumnDescriptor *omnisci_column)
 
bool is_millisecond_precision (const parquet::TimestampLogicalType *timestamp_logical_column)
 
bool validate_none_type_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_boolean_type_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_timestamp_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_timestamp_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_time_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_time_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_date_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_date_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_string_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
SQLTypeInfo suggest_string_mapping (const parquet::ColumnDescriptor *parquet_column)
 
bool validate_array_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
bool validate_geospatial_mapping (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 
void validate_equal_schema (const parquet::arrow::FileReader *reference_file_reader, const parquet::arrow::FileReader *new_file_reader, const std::string &reference_file_path, const std::string &new_file_path)
 
void validate_allowed_mapping (const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)
 
SQLTypeInfo suggest_column_scalar_type (const parquet::ColumnDescriptor *parquet_column)
 
void validate_number_of_columns (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
void throw_missing_metadata_error (const int row_group_index, const int column_index, const std::string &file_path)
 
void throw_row_group_larger_than_fragment_size_error (const MaxRowGroupSizeStats max_row_group_stats, const int fragment_size)
 
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
MaxRowGroupSizeStats validate_parquet_metadata (const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
 
std::list< RowGroupMetadata > metadata_scan_rowgroup_interval (const std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const RowGroupInterval &row_group_interval, const ReaderPtr &reader, const ForeignTableSchema &schema)
 
std::map< int, std::shared_ptr< ParquetEncoder > > populate_encoder_map_for_import (const std::map< int, Chunk_NS::Chunk > chunks, const ForeignTableSchema &schema, const ReaderPtr &reader, const std::map< int, StringDictionary * > column_dictionaries, const int64_t num_rows, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
std::map< int, std::shared_ptr< ParquetEncoder > > populate_encoder_map_for_metadata_scan (const Interval< ColumnType > &column_interval, const ForeignTableSchema &schema, const ReaderPtr &reader, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool do_metadata_stats_validation)
 

Function Documentation

std::shared_ptr< ParquetEncoder > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const bool  is_metadata_scan,
const bool  is_for_import,
const bool  is_for_detect 
)

Definition at line 1018 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_encoder(), foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), SQLTypeInfo::is_fixlen_array(), and is_valid_parquet_list_column().

Referenced by create_parquet_encoder().

1026  {
1027  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column);
1028  if (!is_valid_parquet_list || !omnisci_column->columnType.is_array()) {
1029  return {};
1030  }
1031  std::unique_ptr<ColumnDescriptor> omnisci_column_sub_type_column =
1032  get_sub_type_column_descriptor(omnisci_column);
1033  auto encoder = create_parquet_encoder(omnisci_column_sub_type_column.get(),
1034  parquet_column,
1035  chunks,
1036  string_dictionary,
1037  chunk_metadata,
1038  nullptr,
1039  is_metadata_scan,
1040  is_for_import,
1041  is_for_detect);
1042  CHECK(encoder.get());
1043  auto scalar_encoder = std::dynamic_pointer_cast<ParquetScalarEncoder>(encoder);
1044  CHECK(scalar_encoder);
1045  if (!is_for_import) {
1046  if (!is_for_detect) {
1047  if (omnisci_column->columnType.is_fixlen_array()) {
1048  encoder = std::make_shared<ParquetFixedLengthArrayEncoder>(
1049  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1050  scalar_encoder,
1051  omnisci_column);
1052  } else {
1053  encoder = std::make_shared<ParquetVariableLengthArrayEncoder>(
1054  is_metadata_scan ? nullptr : chunks.begin()->getBuffer(),
1055  is_metadata_scan ? nullptr : chunks.begin()->getIndexBuf(),
1056  scalar_encoder,
1057  omnisci_column);
1058  }
1059  } else { // is_for_detect
1060  encoder = std::make_shared<ParquetArrayDetectEncoder>(
1061  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1062  }
1063  } else { // is_for_import
1064  encoder = std::make_shared<ParquetArrayImportEncoder>(
1065  chunks.begin()->getBuffer(), scalar_encoder, omnisci_column);
1066  }
1067  return encoder;
1068 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
bool is_fixlen_array() const
Definition: sqltypes.h:590
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.
bool is_array() const
Definition: sqltypes.h:588

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 736 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, kENCODING_DATE_IN_DAYS, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

740  {
741  auto column_type = omnisci_column->columnType;
742  if (parquet_column->logical_type()->is_date() && column_type.is_date()) {
743  if (column_type.get_compression() == kENCODING_DATE_IN_DAYS) {
744  if (is_metadata_scan_or_for_import) {
745  if (column_type.get_comp_param() ==
746  0) { // DATE ENCODING FIXED (32) uses comp param 0
747  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int32_t>>(
748  buffer);
749  } else if (column_type.get_comp_param() == 16) {
750  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int16_t>>(
751  buffer);
752  } else {
753  UNREACHABLE();
754  }
755  } else {
756  if (column_type.get_comp_param() ==
757  0) { // DATE ENCODING FIXED (32) uses comp param 0
758  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t>>(
759  buffer, omnisci_column, parquet_column);
760  } else if (column_type.get_comp_param() == 16) {
761  return std::make_shared<ParquetFixedLengthEncoder<int16_t, int32_t>>(
762  buffer, omnisci_column, parquet_column);
763  } else {
764  UNREACHABLE();
765  }
766  }
767  } else if (column_type.get_compression() == kENCODING_NONE) { // for array types
768  return std::make_shared<ParquetDateInSecondsEncoder</*NullType=*/int64_t>>(
769  buffer, omnisci_column, parquet_column);
770  } else {
771  UNREACHABLE();
772  }
773  }
774  return {};
775 }
#define UNREACHABLE()
Definition: Logger.h:337
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 693 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, create_parquet_date_from_timestamp_encoder_with_types(), kENCODING_DATE_IN_DAYS, and UNREACHABLE.

Referenced by create_parquet_encoder().

697  {
698  auto column_type = omnisci_column->columnType;
699  if (parquet_column->logical_type()->is_timestamp() && column_type.is_date()) {
700  CHECK(column_type.get_compression() == kENCODING_DATE_IN_DAYS);
701  if (is_metadata_scan_or_for_import) {
702  if (column_type.get_comp_param() ==
703  0) { // DATE ENCODING FIXED (32) uses comp param 0
704  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
705  int64_t,
706  int32_t>(
707  omnisci_column, parquet_column, buffer, true);
708  } else if (column_type.get_comp_param() == 16) {
709  return create_parquet_date_from_timestamp_encoder_with_types<int64_t,
710  int64_t,
711  int16_t>(
712  omnisci_column, parquet_column, buffer, true);
713  } else {
714  UNREACHABLE();
715  }
716  } else {
717  if (column_type.get_comp_param() ==
718  0) { // DATE ENCODING FIXED (32) uses comp param 0
719  return create_parquet_date_from_timestamp_encoder_with_types<int32_t,
720  int64_t,
721  int32_t>(
722  omnisci_column, parquet_column, buffer, false);
723  } else if (column_type.get_comp_param() == 16) {
724  return create_parquet_date_from_timestamp_encoder_with_types<int16_t,
725  int64_t,
726  int16_t>(
727  omnisci_column, parquet_column, buffer, false);
728  } else {
729  UNREACHABLE();
730  }
731  }
732  }
733  return {};
734 }
#define UNREACHABLE()
Definition: Logger.h:337
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder_with_types(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_date_from_timestamp_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 508 of file LazyParquetChunkLoader.cpp.

References heavydb.dtypes::T, and UNREACHABLE.

Referenced by create_parquet_date_from_timestamp_encoder().

512  {
513  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
514  parquet_column->logical_type().get())) {
515  switch (timestamp_logical_type->time_unit()) {
516  case parquet::LogicalType::TimeUnit::MILLIS:
517  if (is_metadata_scan_or_for_import) {
518  return std::make_shared<
519  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L, NullType>>(
520  buffer, omnisci_column, parquet_column);
521  }
522  return std::make_shared<
523  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L, NullType>>(
524  buffer, omnisci_column, parquet_column);
525  case parquet::LogicalType::TimeUnit::MICROS:
526  if (is_metadata_scan_or_for_import) {
527  return std::make_shared<
528  ParquetDateInSecondsFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
529  buffer, omnisci_column, parquet_column);
530  }
531  return std::make_shared<
532  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
533  buffer, omnisci_column, parquet_column);
534  case parquet::LogicalType::TimeUnit::NANOS:
535  if (is_metadata_scan_or_for_import) {
536  return std::make_shared<
537  ParquetDateInSecondsFromTimestampEncoder<V,
538  T,
539  1000L * 1000L * 1000L,
540  NullType>>(
541  buffer, omnisci_column, parquet_column);
542  }
543  return std::make_shared<
544  ParquetDateInDaysFromTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
545  buffer, omnisci_column, parquet_column);
546  default:
547  UNREACHABLE();
548  }
549  } else {
550  UNREACHABLE();
551  }
552  return {};
553 }
ParquetTimestampEncoder< V, T, conversion_denominator, NullType > ParquetDateInSecondsFromTimestampEncoder
#define UNREACHABLE()
Definition: Logger.h:337

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 171 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

175  {
176  if (parquet_column->logical_type()->is_decimal()) {
177  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
178  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int64_t>(
179  omnisci_column, parquet_column, buffer);
180  }
181  CHECK(omnisci_column->columnType.get_compression() == kENCODING_FIXED);
182  if (is_metadata_scan_or_for_import) {
183  switch (omnisci_column->columnType.get_comp_param()) {
184  case 16:
185  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int16_t>(
186  omnisci_column, parquet_column, buffer);
187  case 32:
188  return create_parquet_decimal_encoder_with_omnisci_type<int64_t, int32_t>(
189  omnisci_column, parquet_column, buffer);
190  default:
191  UNREACHABLE();
192  }
193  } else {
194  switch (omnisci_column->columnType.get_comp_param()) {
195  case 16:
196  return create_parquet_decimal_encoder_with_omnisci_type<int16_t, int16_t>(
197  omnisci_column, parquet_column, buffer);
198  case 32:
199  return create_parquet_decimal_encoder_with_omnisci_type<int32_t, int32_t>(
200  omnisci_column, parquet_column, buffer);
201  default:
202  UNREACHABLE();
203  }
204  }
205  }
206  return {};
207 }
#define UNREACHABLE()
Definition: Logger.h:337
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:392
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_decimal_encoder_with_omnisci_type ( const ColumnDescriptor *  column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor,
AbstractBuffer *  buffer 
)

Definition at line 147 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

150  {
151  switch (parquet_column_descriptor->physical_type()) {
152  case parquet::Type::INT32:
153  return std::make_shared<ParquetDecimalEncoder<V, int32_t, NullType>>(
154  buffer, column_descriptor, parquet_column_descriptor);
155  case parquet::Type::INT64:
156  return std::make_shared<ParquetDecimalEncoder<V, int64_t, NullType>>(
157  buffer, column_descriptor, parquet_column_descriptor);
158  case parquet::Type::FIXED_LEN_BYTE_ARRAY:
159  return std::make_shared<
160  ParquetDecimalEncoder<V, parquet::FixedLenByteArray, NullType>>(
161  buffer, column_descriptor, parquet_column_descriptor);
162  case parquet::Type::BYTE_ARRAY:
163  return std::make_shared<ParquetDecimalEncoder<V, parquet::ByteArray, NullType>>(
164  buffer, column_descriptor, parquet_column_descriptor);
165  default:
166  UNREACHABLE();
167  }
168  return {};
169 }
#define UNREACHABLE()
Definition: Logger.h:337
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const RenderGroupAnalyzerMap *  render_group_analyzer_map,
const bool  is_metadata_scan = false,
const bool  is_for_import = false,
const bool  is_for_detect = false 
)

Create a Parquet specific encoder for a Parquet to OmniSci mapping.

Parameters
omnisci_column - the descriptor of the OmniSci column
parquet_column - the descriptor of the Parquet column
chunks - list of chunks to populate (the case of more than one chunk happens only if a logical column expands to multiple physical columns)
string_dictionary - string dictionary used in encoding for string dictionary encoded columns
chunk_metadata - similar to the list of chunks, a list of chunk metadata that is populated
is_metadata_scan - a flag indicating if the encoders created should be for a metadata scan
is_for_import - a flag indicating if the encoders created should be for import
Returns
An appropriate Parquet encoder for the use case defined by the Parquet to OmniSci mapping.

Notes:

  • In the case of a metadata scan, the type of the encoder created may significantly change (for example, in bit width). This is because it is common for OmniSci to store metadata in a different format altogether than the data itself (see, for example, FixedLengthEncoder).
  • This function and the function isColumnMappingSupported work in conjunction with each other. For example, once a mapping is known to be allowed (since isColumnMappingSupported returned true) this function does not have to check many corner cases exhaustively as it would be redundant with what was checked in isColumnMappingSupported.

Definition at line 902 of file LazyParquetChunkLoader.cpp.

References CHECK, create_parquet_array_encoder(), create_parquet_date_encoder(), create_parquet_date_from_timestamp_encoder(), create_parquet_decimal_encoder(), create_parquet_floating_point_encoder(), create_parquet_geospatial_encoder(), create_parquet_integral_encoder(), create_parquet_none_type_encoder(), create_parquet_string_encoder(), create_parquet_time_encoder(), create_parquet_timestamp_encoder(), and UNREACHABLE.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), create_parquet_array_encoder(), create_parquet_encoder_for_import(), and create_parquet_encoder_for_metadata_scan().

911  {
912  CHECK(!(is_metadata_scan && is_for_import));
913  auto buffer = chunks.empty() ? nullptr : chunks.begin()->getBuffer();
914  if (auto encoder = create_parquet_geospatial_encoder(omnisci_column,
915  parquet_column,
916  chunks,
917  chunk_metadata,
918  render_group_analyzer_map,
919  is_metadata_scan,
920  is_for_import)) {
921  return encoder;
922  }
923  if (auto encoder = create_parquet_array_encoder(omnisci_column,
924  parquet_column,
925  chunks,
926  string_dictionary,
927  chunk_metadata,
928  is_metadata_scan,
929  is_for_import,
930  is_for_detect)) {
931  return encoder;
932  }
933  if (auto encoder = create_parquet_decimal_encoder(
934  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
935  return encoder;
936  }
937  if (auto encoder = create_parquet_integral_encoder(
938  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
939  return encoder;
940  }
941  if (auto encoder =
942  create_parquet_floating_point_encoder(omnisci_column, parquet_column, buffer)) {
943  return encoder;
944  }
945  if (auto encoder = create_parquet_timestamp_encoder(
946  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
947  return encoder;
948  }
949  if (auto encoder =
950  create_parquet_none_type_encoder(omnisci_column, parquet_column, buffer)) {
951  return encoder;
952  }
953  if (auto encoder = create_parquet_time_encoder(
954  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
955  return encoder;
956  }
957  if (auto encoder = create_parquet_date_from_timestamp_encoder(
958  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
959  return encoder;
960  }
961  if (auto encoder = create_parquet_date_encoder(
962  omnisci_column, parquet_column, buffer, is_metadata_scan || is_for_import)) {
963  return encoder;
964  }
965  if (auto encoder = create_parquet_string_encoder(
966  omnisci_column,
967  parquet_column,
968  chunks.empty() ? Chunk_NS::Chunk{} : *chunks.begin(),
969  string_dictionary,
970  chunk_metadata,
971  is_for_import,
972  is_for_detect)) {
973  return encoder;
974  }
975  UNREACHABLE();
976  return {};
977 }
std::shared_ptr< ParquetEncoder > create_parquet_geospatial_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan, const bool is_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_array_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool is_metadata_scan, const bool is_for_import, const bool is_for_detect)
#define UNREACHABLE()
Definition: Logger.h:337
std::shared_ptr< ParquetEncoder > create_parquet_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_none_type_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
std::shared_ptr< ParquetEncoder > create_parquet_time_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_date_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_string_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const Chunk_NS::Chunk &chunk, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, bool is_for_import, const bool is_for_detect)
std::shared_ptr< ParquetEncoder > create_parquet_floating_point_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer)
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ParquetEncoder > create_parquet_date_from_timestamp_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_decimal_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)
std::shared_ptr< ParquetEncoder > create_parquet_integral_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, AbstractBuffer *buffer, const bool is_metadata_scan_or_for_import)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_import ( std::list< Chunk_NS::Chunk > &  chunks,
const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
StringDictionary *  string_dictionary,
const RenderGroupAnalyzerMap *  render_group_analyzer_map 
)

Intended to be used for the import case.

Definition at line 982 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_import().

987  {
988  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
989  return create_parquet_encoder(omnisci_column,
990  parquet_column,
991  chunks,
992  string_dictionary,
993  chunk_metadata,
994  render_group_analyzer_map,
995  false,
996  true);
997 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder_for_metadata_scan ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const RenderGroupAnalyzerMap *  render_group_analyzer_map 
)

Intended to be used only with metadata scan. Creates an incomplete encoder capable of updating metadata.

Definition at line 1003 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder().

Referenced by populate_encoder_map_for_metadata_scan().

1006  {
1007  std::list<Chunk_NS::Chunk> chunks;
1008  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
1009  return create_parquet_encoder(omnisci_column,
1010  parquet_column,
1011  chunks,
1012  nullptr,
1013  chunk_metadata,
1014  render_group_analyzer_map,
1015  true);
1016 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map, const bool is_metadata_scan=false, const bool is_for_import=false, const bool is_for_detect=false)
Create a Parquet specific encoder for a Parquet to OmniSci mapping.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_floating_point_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer 
)

Definition at line 427 of file LazyParquetChunkLoader.cpp.

References CHECK, CHECK_EQ, ColumnDescriptor::columnType, kDOUBLE, kENCODING_NONE, kFLOAT, and UNREACHABLE.

Referenced by create_parquet_encoder().

430  {
431  auto column_type = omnisci_column->columnType;
432  if (!column_type.is_fp()) {
433  return {};
434  }
435  CHECK_EQ(column_type.get_compression(), kENCODING_NONE);
436  switch (column_type.get_type()) {
437  case kFLOAT:
438  switch (parquet_column->physical_type()) {
439  case parquet::Type::FLOAT:
440  return std::make_shared<ParquetFixedLengthEncoder<float, float>>(
441  buffer, omnisci_column, parquet_column);
442  case parquet::Type::DOUBLE:
443  return std::make_shared<ParquetFixedLengthEncoder<float, double>>(
444  buffer, omnisci_column, parquet_column);
445  default:
446  UNREACHABLE();
447  }
448  case kDOUBLE:
449  CHECK(parquet_column->physical_type() == parquet::Type::DOUBLE);
450  return std::make_shared<ParquetFixedLengthEncoder<double, double>>(
451  buffer, omnisci_column, parquet_column);
452  default:
453  UNREACHABLE();
454  }
455  return {};
456 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
#define UNREACHABLE()
Definition: Logger.h:337
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_geospatial_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::list< Chunk_NS::Chunk > &  chunks,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
const RenderGroupAnalyzerMap *  render_group_analyzer_map,
const bool  is_metadata_scan,
const bool  is_for_import 
)

Definition at line 830 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and is_valid_parquet_string().

Referenced by create_parquet_encoder().

837  {
838  auto column_type = omnisci_column->columnType;
839  if (!is_valid_parquet_string(parquet_column) || !column_type.is_geometry()) {
840  return {};
841  }
842  if (is_for_import) {
843  return std::make_shared<ParquetGeospatialImportEncoder>(chunks); // no RGAMap
844  }
845  if (is_metadata_scan) {
846  return std::make_shared<ParquetGeospatialEncoder>(render_group_analyzer_map);
847  }
848  for (auto chunks_iter = chunks.begin(); chunks_iter != chunks.end(); ++chunks_iter) {
849  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
850  auto& chunk_metadata_ptr = chunk_metadata.back();
851  chunk_metadata_ptr->sqlType = chunks_iter->getColumnDesc()->columnType;
852  }
853  return std::make_shared<ParquetGeospatialEncoder>(
854  parquet_column, chunks, chunk_metadata, render_group_analyzer_map);
855 }
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 297 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, and UNREACHABLE.

Referenced by create_parquet_encoder().

301  {
302  auto column_type = omnisci_column->columnType;
303  auto physical_type = parquet_column->physical_type();
304 
305  int bit_width = -1;
306  int is_signed = false;
307  // handle the integral case with no Parquet annotation
308  if (parquet_column->logical_type()->is_none() && column_type.is_integer()) {
309  if (physical_type == parquet::Type::INT32) {
310  bit_width = 32;
311  } else if (physical_type == parquet::Type::INT64) {
312  bit_width = 64;
313  } else {
314  UNREACHABLE();
315  }
316  is_signed = true;
317  }
318  // handle the integral case with Parquet annotation
319  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
320  parquet_column->logical_type().get())) {
321  bit_width = int_logical_column->bit_width();
322  is_signed = int_logical_column->is_signed();
323  }
324 
325  if (bit_width == -1) { // no valid logical type (with or without annotation) found
326  return {};
327  }
328 
329  const size_t omnisci_data_type_byte_size = column_type.get_size();
330  const size_t parquet_data_type_byte_size = parquet::GetTypeByteSize(physical_type);
331 
332  switch (omnisci_data_type_byte_size) {
333  case 8:
334  CHECK(column_type.get_compression() == kENCODING_NONE);
335  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int64_t>(
336  buffer,
337  omnisci_data_type_byte_size,
338  parquet_data_type_byte_size,
339  bit_width,
340  is_signed);
341  case 4:
342  if (is_metadata_scan_or_for_import && column_type.get_type() == kBIGINT) {
343  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int32_t>(
344  buffer,
345  omnisci_data_type_byte_size,
346  parquet_data_type_byte_size,
347  bit_width,
348  is_signed);
349  }
350  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int32_t>(
351  buffer,
352  omnisci_data_type_byte_size,
353  parquet_data_type_byte_size,
354  bit_width,
355  is_signed);
356  case 2:
357  if (is_metadata_scan_or_for_import) {
358  switch (column_type.get_type()) {
359  case kBIGINT:
360  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int16_t>(
361  buffer,
362  omnisci_data_type_byte_size,
363  parquet_data_type_byte_size,
364  bit_width,
365  is_signed);
366  case kINT:
367  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int16_t>(
368  buffer,
369  omnisci_data_type_byte_size,
370  parquet_data_type_byte_size,
371  bit_width,
372  is_signed);
373  case kSMALLINT:
374  break;
375  default:
376  UNREACHABLE();
377  }
378  }
379  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int16_t>(
380  buffer,
381  omnisci_data_type_byte_size,
382  parquet_data_type_byte_size,
383  bit_width,
384  is_signed);
385  case 1:
386  if (is_metadata_scan_or_for_import) {
387  switch (column_type.get_type()) {
388  case kBIGINT:
389  return create_parquet_integral_encoder_with_omnisci_type<int64_t, int8_t>(
390  buffer,
391  omnisci_data_type_byte_size,
392  parquet_data_type_byte_size,
393  bit_width,
394  is_signed);
395  case kINT:
396  return create_parquet_integral_encoder_with_omnisci_type<int32_t, int8_t>(
397  buffer,
398  omnisci_data_type_byte_size,
399  parquet_data_type_byte_size,
400  bit_width,
401  is_signed);
402  case kSMALLINT:
403  return create_parquet_integral_encoder_with_omnisci_type<int16_t, int8_t>(
404  buffer,
405  omnisci_data_type_byte_size,
406  parquet_data_type_byte_size,
407  bit_width,
408  is_signed);
409  case kTINYINT:
410  break;
411  default:
412  UNREACHABLE();
413  }
414  }
415  return create_parquet_integral_encoder_with_omnisci_type<int8_t, int8_t>(
416  buffer,
417  omnisci_data_type_byte_size,
418  parquet_data_type_byte_size,
419  bit_width,
420  is_signed);
421  default:
422  UNREACHABLE();
423  }
424  return {};
425 }
#define UNREACHABLE()
Definition: Logger.h:337
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqltypes.h:62
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_integral_encoder_with_omnisci_type ( AbstractBuffer *  buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const int  bit_width,
const bool  is_signed 
)

Create an integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
bit_width- bit width specified for the Parquet column
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated type V and NullType.

Note, this function determines the appropriate bit depth integral encoder to create, while create_parquet_signed_or_unsigned_integral_encoder_with_types determines whether to create a signed or unsigned integral encoder.

Definition at line 260 of file LazyParquetChunkLoader.cpp.

References create_parquet_signed_or_unsigned_integral_encoder_with_types(), and UNREACHABLE.

265  {
266  switch (bit_width) {
267  case 8:
268  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
269  int32_t,
270  uint8_t,
271  NullType>(
272  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
273  case 16:
274  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
275  int32_t,
276  uint16_t,
277  NullType>(
278  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
279  case 32:
280  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
281  int32_t,
282  uint32_t,
283  NullType>(
284  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
285  case 64:
286  return create_parquet_signed_or_unsigned_integral_encoder_with_types<V,
287  int64_t,
288  uint64_t,
289  NullType>(
290  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size, is_signed);
291  default:
292  UNREACHABLE();
293  }
294  return {};
295 }
std::shared_ptr< ParquetEncoder > create_parquet_signed_or_unsigned_integral_encoder_with_types(AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size, const bool is_signed)
Create a signed or unsigned integral parquet encoder using types.
#define UNREACHABLE()
Definition: Logger.h:337

+ Here is the call graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_none_type_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer 
)

Definition at line 458 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_string(), kBOOLEAN, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

461  {
462  auto column_type = omnisci_column->columnType;
463  if (parquet_column->logical_type()->is_none() &&
464  !omnisci_column->columnType.is_string()) { // boolean
465  if (column_type.get_compression() == kENCODING_NONE) {
466  switch (column_type.get_type()) {
467  case kBOOLEAN:
468  return std::make_shared<ParquetFixedLengthEncoder<int8_t, bool>>(
469  buffer, omnisci_column, parquet_column);
470  default:
471  UNREACHABLE();
472  }
473  } else {
474  UNREACHABLE();
475  }
476  }
477  return {};
478 }
#define UNREACHABLE()
Definition: Logger.h:337
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:580

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename U , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_signed_or_unsigned_integral_encoder_with_types ( AbstractBuffer *  buffer,
const size_t  omnisci_data_type_byte_size,
const size_t  parquet_data_type_byte_size,
const bool  is_signed 
)

Create a signed or unsigned integral parquet encoder using types.

Parameters
buffer- buffer used within the encoder
omnisci_data_type_byte_size- size in number of bytes of OmniSci type
parquet_data_type_byte_size- size in number of bytes of Parquet physical type
is_signed- flag indicating if Parquet column is signed
Returns
a std::shared_ptr to an integral encoder

See the documentation for ParquetFixedLengthEncoder and ParquetUnsignedFixedLengthEncoder for a description of the semantics of the templated types V, T, U, and NullType.

Definition at line 225 of file LazyParquetChunkLoader.cpp.

References CHECK.

Referenced by create_parquet_integral_encoder_with_omnisci_type().

229  {
230  CHECK(sizeof(NullType) == omnisci_data_type_byte_size);
231  if (is_signed) {
232  return std::make_shared<ParquetFixedLengthEncoder<V, T, NullType>>(
233  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
234  } else {
235  return std::make_shared<ParquetUnsignedFixedLengthEncoder<V, T, U, NullType>>(
236  buffer, omnisci_data_type_byte_size, parquet_data_type_byte_size);
237  }
238 }
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_string_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
const Chunk_NS::Chunk &  chunk,
StringDictionary *  string_dictionary,
std::list< std::unique_ptr< ChunkMetadata >> &  chunk_metadata,
bool  is_for_import,
const bool  is_for_detect 
)

Definition at line 777 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, Chunk_NS::Chunk::getBuffer(), Chunk_NS::Chunk::getIndexBuf(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

784  {
785  auto column_type = omnisci_column->columnType;
786  if (!is_valid_parquet_string(parquet_column) ||
787  !omnisci_column->columnType.is_string()) {
788  return {};
789  }
790  if (column_type.get_compression() == kENCODING_NONE) {
791  if (is_for_import) {
792  return std::make_shared<ParquetStringImportEncoder>(chunk.getBuffer());
793  } else {
794  return std::make_shared<ParquetStringNoneEncoder>(chunk.getBuffer(),
795  chunk.getIndexBuf());
796  }
797  } else if (column_type.get_compression() == kENCODING_DICT) {
798  if (!is_for_detect) { // non-detect use case
799  chunk_metadata.emplace_back(std::make_unique<ChunkMetadata>());
800  std::unique_ptr<ChunkMetadata>& logical_chunk_metadata = chunk_metadata.back();
801  logical_chunk_metadata->sqlType = omnisci_column->columnType;
802  switch (column_type.get_size()) {
803  case 1:
804  return std::make_shared<ParquetStringEncoder<uint8_t>>(
805  chunk.getBuffer(),
806  string_dictionary,
807  is_for_import ? nullptr : logical_chunk_metadata.get());
808  case 2:
809  return std::make_shared<ParquetStringEncoder<uint16_t>>(
810  chunk.getBuffer(),
811  string_dictionary,
812  is_for_import ? nullptr : logical_chunk_metadata.get());
813  case 4:
814  return std::make_shared<ParquetStringEncoder<int32_t>>(
815  chunk.getBuffer(),
816  string_dictionary,
817  is_for_import ? nullptr : logical_chunk_metadata.get());
818  default:
819  UNREACHABLE();
820  }
821  } else { // detect use-case
822  return std::make_shared<ParquetDetectStringEncoder>(chunk.getBuffer());
823  }
824  } else {
825  UNREACHABLE();
826  }
827  return {};
828 }
AbstractBuffer * getIndexBuf() const
Definition: Chunk.h:148
#define UNREACHABLE()
Definition: Logger.h:337
AbstractBuffer * getBuffer() const
Definition: Chunk.h:146
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:580

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 640 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

644  {
645  auto column_type = omnisci_column->columnType;
646  if (auto time_logical_column = dynamic_cast<const parquet::TimeLogicalType*>(
647  parquet_column->logical_type().get())) {
648  if (column_type.get_compression() == kENCODING_NONE) {
649  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
650  return create_parquet_time_encoder_with_types<int64_t, int32_t, int64_t>(
651  omnisci_column, parquet_column, buffer);
652  } else {
653  return create_parquet_time_encoder_with_types<int64_t, int64_t, int64_t>(
654  omnisci_column, parquet_column, buffer);
655  }
656  } else if (column_type.get_compression() == kENCODING_FIXED) {
657  if (is_metadata_scan_or_for_import) {
658  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
659  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
660  return create_parquet_time_encoder_with_types<int64_t, int32_t, int32_t>(
661  omnisci_column, parquet_column, buffer);
662  } else {
663  CHECK(time_logical_column->time_unit() ==
664  parquet::LogicalType::TimeUnit::MICROS ||
665  time_logical_column->time_unit() ==
666  parquet::LogicalType::TimeUnit::NANOS);
667  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
668  return create_parquet_time_encoder_with_types<int64_t, int64_t, int32_t>(
669  omnisci_column, parquet_column, buffer);
670  }
671  } else {
672  if (time_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS) {
673  CHECK(parquet_column->physical_type() == parquet::Type::INT32);
674  return create_parquet_time_encoder_with_types<int32_t, int32_t, int32_t>(
675  omnisci_column, parquet_column, buffer);
676  } else {
677  CHECK(time_logical_column->time_unit() ==
678  parquet::LogicalType::TimeUnit::MICROS ||
679  time_logical_column->time_unit() ==
680  parquet::LogicalType::TimeUnit::NANOS);
681  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
682  return create_parquet_time_encoder_with_types<int32_t, int64_t, int32_t>(
683  omnisci_column, parquet_column, buffer);
684  }
685  }
686  } else {
687  UNREACHABLE();
688  }
689  }
690  return {};
691 }
#define UNREACHABLE()
Definition: Logger.h:337
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_time_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer 
)

Definition at line 614 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

617  {
618  if (auto time_logical_type = dynamic_cast<const parquet::TimeLogicalType*>(
619  parquet_column->logical_type().get())) {
620  switch (time_logical_type->time_unit()) {
621  case parquet::LogicalType::TimeUnit::MILLIS:
622  return std::make_shared<ParquetTimeEncoder<V, T, 1000L, NullType>>(
623  buffer, omnisci_column, parquet_column);
624  case parquet::LogicalType::TimeUnit::MICROS:
625  return std::make_shared<ParquetTimeEncoder<V, T, 1000L * 1000L, NullType>>(
626  buffer, omnisci_column, parquet_column);
627  case parquet::LogicalType::TimeUnit::NANOS:
628  return std::make_shared<
629  ParquetTimeEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
630  buffer, omnisci_column, parquet_column);
631  default:
632  UNREACHABLE();
633  }
634  } else {
635  UNREACHABLE();
636  }
637  return {};
638 }
#define UNREACHABLE()
Definition: Logger.h:337
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer,
const bool  is_metadata_scan_or_for_import 
)

Definition at line 555 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_precision(), kENCODING_FIXED, kENCODING_NONE, and UNREACHABLE.

Referenced by create_parquet_encoder().

559  {
560  auto column_type = omnisci_column->columnType;
561  auto precision = column_type.get_precision();
562  if (parquet_column->logical_type()->is_timestamp()) {
563  if (column_type.get_compression() == kENCODING_NONE) {
564  if (precision == 0) {
565  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int64_t>(
566  omnisci_column, parquet_column, buffer);
567  } else {
568  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
569  buffer, omnisci_column, parquet_column);
570  }
571  } else if (column_type.get_compression() == kENCODING_FIXED) {
572  CHECK(column_type.get_comp_param() == 32);
573  if (is_metadata_scan_or_for_import) {
574  return create_parquet_timestamp_encoder_with_types<int64_t, int64_t, int32_t>(
575  omnisci_column, parquet_column, buffer);
576  } else {
577  return create_parquet_timestamp_encoder_with_types<int32_t, int64_t, int32_t>(
578  omnisci_column, parquet_column, buffer);
579  }
580  }
581  } else if (parquet_column->logical_type()->is_none() && column_type.is_timestamp()) {
582  if (parquet_column->physical_type() == parquet::Type::INT32) {
583  CHECK(column_type.get_compression() == kENCODING_FIXED &&
584  column_type.get_comp_param() == 32);
585  if (is_metadata_scan_or_for_import) {
586  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int32_t, int32_t>>(
587  buffer, omnisci_column, parquet_column);
588  } else {
589  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int32_t, int32_t>>(
590  buffer, omnisci_column, parquet_column);
591  }
592  } else if (parquet_column->physical_type() == parquet::Type::INT64) {
593  if (column_type.get_compression() == kENCODING_NONE) {
594  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int64_t>>(
595  buffer, omnisci_column, parquet_column);
596  } else if (column_type.get_compression() == kENCODING_FIXED) {
597  CHECK(column_type.get_comp_param() == 32);
598  if (is_metadata_scan_or_for_import) {
599  return std::make_shared<ParquetFixedLengthEncoder<int64_t, int64_t, int32_t>>(
600  buffer, omnisci_column, parquet_column);
601  } else {
602  return std::make_shared<ParquetFixedLengthEncoder<int32_t, int64_t, int32_t>>(
603  buffer, omnisci_column, parquet_column);
604  }
605  }
606  } else {
607  UNREACHABLE();
608  }
609  }
610  return {};
611 }
#define UNREACHABLE()
Definition: Logger.h:337
int get_precision() const
Definition: sqltypes.h:384
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , typename T , typename NullType >
std::shared_ptr<ParquetEncoder> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_timestamp_encoder_with_types ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
AbstractBuffer *  buffer 
)

Definition at line 481 of file LazyParquetChunkLoader.cpp.

References UNREACHABLE.

484  {
485  if (auto timestamp_logical_type = dynamic_cast<const parquet::TimestampLogicalType*>(
486  parquet_column->logical_type().get())) {
487  switch (timestamp_logical_type->time_unit()) {
488  case parquet::LogicalType::TimeUnit::MILLIS:
489  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L, NullType>>(
490  buffer, omnisci_column, parquet_column);
491  case parquet::LogicalType::TimeUnit::MICROS:
492  return std::make_shared<ParquetTimestampEncoder<V, T, 1000L * 1000L, NullType>>(
493  buffer, omnisci_column, parquet_column);
494  case parquet::LogicalType::TimeUnit::NANOS:
495  return std::make_shared<
496  ParquetTimestampEncoder<V, T, 1000L * 1000L * 1000L, NullType>>(
497  buffer, omnisci_column, parquet_column);
498  default:
499  UNREACHABLE();
500  }
501  } else {
502  UNREACHABLE();
503  }
504  return {};
505 }
#define UNREACHABLE()
Definition: Logger.h:337
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const ColumnDescriptor *  omnisci_column)

Definition at line 1298 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1298  {
1299  return omnisci_column->columnType.get_dimension() == 6;
1300 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:383
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_microsecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1302 of file LazyParquetChunkLoader.cpp.

1303  {
1304  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MICROS;
1305 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const ColumnDescriptor *  omnisci_column)

Definition at line 1307 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1307  {
1308  return omnisci_column->columnType.get_dimension() == 3;
1309 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:383
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_millisecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1311 of file LazyParquetChunkLoader.cpp.

1312  {
1313  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::MILLIS;
1314 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const ColumnDescriptor *  omnisci_column)

Definition at line 1289 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, and SQLTypeInfo::get_dimension().

Referenced by suggest_timestamp_mapping(), and validate_timestamp_mapping().

1289  {
1290  return omnisci_column->columnType.get_dimension() == 9;
1291 }
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:383
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_nanosecond_precision ( const parquet::TimestampLogicalType *  timestamp_logical_column)

Definition at line 1293 of file LazyParquetChunkLoader.cpp.

1294  {
1295  return timestamp_logical_column->time_unit() == parquet::LogicalType::TimeUnit::NANOS;
1296 }
bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_list_column ( const parquet::ColumnDescriptor *  parquet_column)

Detect a valid list parquet column.

Parameters
parquet_column- the parquet column descriptor of the column to detect
Returns
true if it is a valid parquet list column

Note: the notion of a valid parquet list column is adapted from the parquet schema specification for logical type definitions:

<list-repetition> group <name> (LIST) {
  repeated group list {
    <element-repetition> <element-type> element;
  }
}

Testing has shown that there are small deviations from this specification in at least one library — pyarrow — where the innermost schema node is named "item" as opposed to "element".

The following is also true of the schema definition.

  • The outer-most level must be a group annotated with LIST that contains a single field named list. The repetition of this level must be either optional or required and determines whether the list is nullable.
  • The middle level, named list, must be a repeated group with a single field named element.
  • The element field encodes the list's element type and repetition. Element repetition must be required or optional.

FSI further restricts lists to be defined only at the top level, meaning directly below the root schema node.

Definition at line 101 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_array_encoder(), foreign_storage::LazyParquetChunkLoader::suggestColumnMapping(), validate_array_mapping(), validate_column_mapping_and_row_group_metadata(), validate_definition_levels(), and validate_max_repetition_and_definition_level().

101  {
102  const parquet::schema::Node* node = parquet_column->schema_node().get();
103  if ((node->name() != "element" && node->name() != "item") ||
104  !(node->is_required() ||
105  node->is_optional())) { // ensure first innermost node is named "element"
106  // which is required by the parquet specification;
107  // however testing shows that pyarrow generates this
108  // column with the name of "item"
109  // this field must be either required or optional
110  return false;
111  }
112  node = node->parent();
113  if (!node) { // required nested structure
114  return false;
115  }
116  if (node->name() != "list" || !node->is_repeated() ||
117  !node->is_group()) { // ensure second innermost node is named "list" which is
118  // a repeated group; this is
119  // required by the parquet specification
120  return false;
121  }
122  node = node->parent();
123  if (!node) { // required nested structure
124  return false;
125  }
126  if (!node->logical_type()->is_list() ||
127  !(node->is_optional() ||
128  node->is_required())) { // ensure third outermost node has logical type LIST
129  // which is either optional or required; this is required
130  // by the parquet specification
131  return false;
132  }
133  node =
134  node->parent(); // this must now be the root node of schema which is required by
135  // FSI (lists can not be embedded into a deeper nested structure)
136  if (!node) { // required nested structure
137  return false;
138  }
139  node = node->parent();
140  if (node) { // implies the previous node was not the root node
141  return false;
142  }
143  return true;
144 }

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_string ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 59 of file LazyParquetChunkLoader.cpp.

Referenced by create_parquet_geospatial_encoder(), create_parquet_string_encoder(), suggest_column_scalar_type(), suggest_string_mapping(), validate_geospatial_mapping(), and validate_string_mapping().

59  {
60  return (parquet_column->logical_type()->is_none() &&
61  parquet_column->physical_type() == parquet::Type::BYTE_ARRAY) ||
62  parquet_column->logical_type()->is_string();
63 }

+ Here is the caller graph for this function:

std::list<RowGroupMetadata> foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval ( const std::map< int, std::shared_ptr< ParquetEncoder >> &  encoder_map,
const RowGroupInterval &  row_group_interval,
const ReaderPtr &  reader,
const ForeignTableSchema &  schema 
)

Definition at line 1676 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnId, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::ForeignTableSchema::getLogicalColumn(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::RowGroupInterval::start_index.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1680  {
1681  std::list<RowGroupMetadata> row_group_metadata;
1682  auto column_interval =
1683  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
1684  schema.getLogicalAndPhysicalColumns().back()->columnId};
1685 
1686  auto file_metadata = reader->parquet_reader()->metadata();
1687  for (int row_group = row_group_interval.start_index;
1688  row_group <= row_group_interval.end_index;
1689  ++row_group) {
1690  auto& row_group_metadata_item = row_group_metadata.emplace_back();
1691  row_group_metadata_item.row_group_index = row_group;
1692  row_group_metadata_item.file_path = row_group_interval.file_path;
1693 
1694  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1695  file_metadata->RowGroup(row_group);
1696 
1697  for (int column_id = column_interval.start; column_id <= column_interval.end;
1698  column_id++) {
1699  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1700  auto parquet_column_index = schema.getParquetColumnIndex(column_id);
1701  auto encoder_map_iter =
1702  encoder_map.find(schema.getLogicalColumn(column_id)->columnId);
1703  CHECK(encoder_map_iter != encoder_map.end());
1704  try {
1705  auto metadata = encoder_map_iter->second->getRowGroupMetadata(
1706  group_metadata.get(), parquet_column_index, column_descriptor->columnType);
1707  row_group_metadata_item.column_chunk_metadata.emplace_back(metadata);
1708  } catch (const std::exception& e) {
1709  std::stringstream error_message;
1710  error_message << e.what() << " in row group " << row_group << " of Parquet file '"
1711  << row_group_interval.file_path << "'.";
1712  throw std::runtime_error(error_message.str());
1713  }
1714  }
1715  }
1716  return row_group_metadata;
1717 }
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_import ( const std::map< int, Chunk_NS::Chunk >  chunks,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const std::map< int, StringDictionary * >  column_dictionaries,
const int64_t  num_rows,
const RenderGroupAnalyzerMap *  render_group_analyzer_map 
)

Definition at line 1719 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_import(), shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), and foreign_storage::ForeignTableSchema::getParquetColumnIndex().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1725  {
1726  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1727  auto file_metadata = reader->parquet_reader()->metadata();
1728  for (auto& [column_id, chunk] : chunks) {
1729  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1730  if (column_descriptor->isGeoPhyCol) { // skip physical columns
1731  continue;
1732  }
1733  auto parquet_column_descriptor =
1734  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1735  auto find_it = column_dictionaries.find(column_id);
1736  StringDictionary* dictionary =
1737  (find_it == column_dictionaries.end() ? nullptr : find_it->second);
1738  std::list<Chunk_NS::Chunk> chunks_for_import;
1739  chunks_for_import.push_back(chunk);
1740  if (column_descriptor->columnType.is_geometry()) {
1741  for (int i = 0; i < column_descriptor->columnType.get_physical_cols(); ++i) {
1742  chunks_for_import.push_back(chunks.at(column_id + i + 1));
1743  }
1744  }
1745  encoder_map[column_id] = create_parquet_encoder_for_import(chunks_for_import,
1746  column_descriptor,
1747  parquet_column_descriptor,
1748  dictionary,
1749  render_group_analyzer_map);
1750 
1751  // reserve space in buffer when num-elements known ahead of time for types
1752  // of known size (for example dictionary encoded strings)
1753  auto encoder = shared::get_from_map(encoder_map, column_id);
1754  if (auto inplace_encoder = dynamic_cast<ParquetInPlaceEncoder*>(encoder.get())) {
1755  inplace_encoder->reserve(num_rows);
1756  }
1757  }
1758  return encoder_map;
1759 }
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_import(std::list< Chunk_NS::Chunk > &chunks, const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, StringDictionary *string_dictionary, const RenderGroupAnalyzerMap *render_group_analyzer_map)
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:61

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::map<int, std::shared_ptr<ParquetEncoder> > foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_metadata_scan ( const Interval< ColumnType > &  column_interval,
const ForeignTableSchema &  schema,
const ReaderPtr &  reader,
const RenderGroupAnalyzerMap *  render_group_analyzer_map,
const bool  do_metadata_stats_validation 
)

Definition at line 1761 of file LazyParquetChunkLoader.cpp.

References create_parquet_encoder_for_metadata_scan(), foreign_storage::Interval< T >::end, shared::get_from_map(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), and foreign_storage::Interval< T >::start.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1766  {
1767  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1768  auto file_metadata = reader->parquet_reader()->metadata();
1769  for (int column_id = column_interval.start; column_id <= column_interval.end;
1770  column_id++) {
1771  const auto column_descriptor = schema.getColumnDescriptor(column_id);
1772  auto parquet_column_descriptor =
1773  file_metadata->schema()->Column(schema.getParquetColumnIndex(column_id));
1774  encoder_map[column_id] = create_parquet_encoder_for_metadata_scan(
1775  column_descriptor, parquet_column_descriptor, render_group_analyzer_map);
1776  if (!do_metadata_stats_validation) {
1777  shared::get_from_map(encoder_map, column_id)->disableMetadataStatsValidation();
1778  }
1779  column_id += column_descriptor->columnType.get_physical_cols();
1780  }
1781  return encoder_map;
1782 }
T const end
Definition: Intervals.h:68
V & get_from_map(std::map< K, V, comp > &map, const K &key)
Definition: misc.h:61
std::shared_ptr< ParquetEncoder > create_parquet_encoder_for_metadata_scan(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column, const RenderGroupAnalyzerMap *render_group_analyzer_map)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column,
std::vector< int8_t > &  values 
)

Definition at line 1136 of file LazyParquetChunkLoader.cpp.

References foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements, ColumnDescriptor::columnType, and SQLTypeInfo::get_size().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1138  {
1139  auto max_type_byte_size =
1140  std::max(omnisci_column->columnType.get_size(),
1141  parquet::GetTypeByteSize(parquet_column->physical_type()));
1142  size_t values_size =
1143  LazyParquetChunkLoader::batch_reader_num_elements * max_type_byte_size;
1144  values.resize(values_size);
1145 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:393
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_boolean_type_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1325 of file LazyParquetChunkLoader.cpp.

References kBOOLEAN, kENCODING_NONE, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1326  {
1327  SQLTypeInfo type;
1329  type.set_type(kBOOLEAN);
1330  type.set_fixed_size();
1331  return type;
1332 }
void set_compression(EncodingType c)
Definition: sqltypes.h:504
void set_fixed_size()
Definition: sqltypes.h:502
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:493

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_column_scalar_type ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1530 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_string(), suggest_boolean_type_mapping(), suggest_date_mapping(), suggest_decimal_mapping(), suggest_floating_point_mapping(), suggest_integral_mapping(), suggest_string_mapping(), suggest_time_mapping(), and suggest_timestamp_mapping().

Referenced by foreign_storage::LazyParquetChunkLoader::suggestColumnMapping().

1530  {
1531  // decimal case
1532  if (parquet_column->logical_type()->is_decimal()) {
1533  return suggest_decimal_mapping(parquet_column);
1534  }
1535  // float case
1536  if (parquet_column->logical_type()->is_none() &&
1537  (parquet_column->physical_type() == parquet::Type::FLOAT ||
1538  parquet_column->physical_type() == parquet::Type::DOUBLE)) {
1539  return suggest_floating_point_mapping(parquet_column);
1540  }
1541  // integral case
1542  if ((parquet_column->logical_type()->is_none() &&
1543  (parquet_column->physical_type() == parquet::Type::INT32 ||
1544  parquet_column->physical_type() == parquet::Type::INT64)) ||
1545  parquet_column->logical_type()->is_int()) {
1546  return suggest_integral_mapping(parquet_column);
1547  }
1548  // boolean case
1549  if (parquet_column->logical_type()->is_none() &&
1550  parquet_column->physical_type() == parquet::Type::BOOLEAN) {
1551  return suggest_boolean_type_mapping(parquet_column);
1552  }
1553  // timestamp case
1554  if (parquet_column->logical_type()->is_timestamp()) {
1555  return suggest_timestamp_mapping(parquet_column);
1556  }
1557  // time case
1558  if (parquet_column->logical_type()->is_time()) {
1559  return suggest_time_mapping(parquet_column);
1560  }
1561  // date case
1562  if (parquet_column->logical_type()->is_date()) {
1563  return suggest_date_mapping(parquet_column);
1564  }
1565  // string case
1566  if (is_valid_parquet_string(parquet_column)) {
1567  return suggest_string_mapping(parquet_column);
1568  }
1569 
1570  throw ForeignStorageException("Unsupported data type detected for column: " +
1571  parquet_column->ToString());
1572 }
SQLTypeInfo suggest_decimal_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_timestamp_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_string_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_date_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_floating_point_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_integral_mapping(const parquet::ColumnDescriptor *parquet_column)
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_boolean_type_mapping(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo suggest_time_mapping(const parquet::ColumnDescriptor *parquet_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_date_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1429 of file LazyParquetChunkLoader.cpp.

References CHECK, kDATE, kENCODING_NONE, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1429  {
1430  CHECK(parquet_column->logical_type()->is_date());
1431  SQLTypeInfo type;
1432  type.set_type(kDATE);
1433  type.set_compression(kENCODING_NONE);
1434  type.set_fixed_size();
1435  return type;
1436 }
Definition: sqltypes.h:70
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_decimal_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1161 of file LazyParquetChunkLoader.cpp.

References kDECIMAL, kENCODING_NONE, sql_constants::kMaxNumericPrecision, SQLTypeInfo::scale, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_scale(), SQLTypeInfo::set_type(), to_string(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1161  {
1162  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1163  parquet_column->logical_type().get())) {
1164  auto parquet_precision = decimal_logical_column->precision();
1165  auto parquet_scale = decimal_logical_column->scale();
1166  if (parquet_precision > sql_constants::kMaxNumericPrecision) {
1167  throw ForeignStorageException(
1168  "Parquet column \"" + parquet_column->ToString() +
1169  "\" has decimal precision of " + std::to_string(parquet_precision) +
1170  " which is too high to import, maximum precision supported is " +
1171  std::to_string(sql_constants::kMaxNumericPrecision));
1172  }
1173  SQLTypeInfo type;
1174  type.set_type(kDECIMAL);
1175  type.set_compression(kENCODING_NONE);
1176  type.set_precision(parquet_precision);
1177  type.set_scale(parquet_scale);
1178  type.set_fixed_size();
1179  return type;
1180  }
1181  UNREACHABLE()
1182  << " a Parquet column's decimal logical type failed to be read appropriately";
1183  return {};
1184 }
void set_compression(EncodingType c)
Definition: sqltypes.h:504
static constexpr int32_t kMaxNumericPrecision
Definition: sqltypes.h:48
#define UNREACHABLE()
Definition: Logger.h:337
std::string to_string(char const *&&v)
void set_fixed_size()
Definition: sqltypes.h:502
void set_scale(int s)
Definition: sqltypes.h:498
void set_precision(int d)
Definition: sqltypes.h:496
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:493

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_floating_point_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1202 of file LazyParquetChunkLoader.cpp.

References kDOUBLE, kENCODING_NONE, kFLOAT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1203  {
1204  SQLTypeInfo type;
1205  if (parquet_column->physical_type() == parquet::Type::FLOAT) {
1206  type.set_type(kFLOAT);
1207  } else if (parquet_column->physical_type() == parquet::Type::DOUBLE) {
1208  type.set_type(kDOUBLE);
1209  } else {
1210  UNREACHABLE();
1211  }
1212  type.set_compression(kENCODING_NONE);
1213  type.set_fixed_size();
1214  return type;
1215 }
void set_compression(EncodingType c)
Definition: sqltypes.h:504
#define UNREACHABLE()
Definition: Logger.h:337
void set_fixed_size()
Definition: sqltypes.h:502
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:493

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_integral_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1244 of file LazyParquetChunkLoader.cpp.

References CHECK, kBIGINT, kENCODING_NONE, kINT, kSMALLINT, kTINYINT, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), run_benchmark_import::type, and within_range().

Referenced by suggest_column_scalar_type().

1244  {
1245  SQLTypeInfo type;
1246  type.set_compression(kENCODING_NONE);
1247  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1248  parquet_column->logical_type().get())) {
1249  auto bit_width = int_logical_column->bit_width();
1250  if (!int_logical_column->is_signed()) {
1251  if (within_range(33, 64, bit_width)) {
1252  throw ForeignStorageException(
1253  "Unsigned integer column \"" + parquet_column->name() +
1254  "\" in Parquet file with 64 bit-width has no supported type for ingestion "
1255  "that will not result in data loss");
1256  } else if (within_range(17, 32, bit_width)) {
1257  type.set_type(kBIGINT);
1258  } else if (within_range(9, 16, bit_width)) {
1259  type.set_type(kINT);
1260  } else if (within_range(0, 8, bit_width)) {
1261  type.set_type(kSMALLINT);
1262  }
1263  } else {
1264  if (within_range(33, 64, bit_width)) {
1265  type.set_type(kBIGINT);
1266  } else if (within_range(17, 32, bit_width)) {
1267  type.set_type(kINT);
1268  } else if (within_range(9, 16, bit_width)) {
1269  type.set_type(kSMALLINT);
1270  } else if (within_range(0, 8, bit_width)) {
1271  type.set_type(kTINYINT);
1272  }
1273  }
1274  type.set_fixed_size();
1275  return type;
1276  }
1277 
1278  CHECK(parquet_column->logical_type()->is_none());
1279  if (parquet_column->physical_type() == parquet::Type::INT32) {
1280  type.set_type(kINT);
1281  } else {
1282  CHECK(parquet_column->physical_type() == parquet::Type::INT64);
1283  type.set_type(kBIGINT);
1284  }
1285  type.set_fixed_size();
1286  return type;
1287 }
void set_compression(EncodingType c)
Definition: sqltypes.h:504
void set_fixed_size()
Definition: sqltypes.h:502
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqltypes.h:62
bool within_range(int64_t lower_bound, int64_t upper_bound, int64_t value)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:493

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_string_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1446 of file LazyParquetChunkLoader.cpp.

References CHECK, is_valid_parquet_string(), kENCODING_DICT, kTEXT, SQLTypeInfo::set_comp_param(), SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_type(), and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1446  {
1447  CHECK(is_valid_parquet_string(parquet_column));
1448  SQLTypeInfo type;
1449  type.set_type(kTEXT);
1450  type.set_compression(kENCODING_DICT);
1451  type.set_comp_param(32);
1452  type.set_fixed_size();
1453  return type;
1454 }
void set_compression(EncodingType c)
Definition: sqltypes.h:504
void set_fixed_size()
Definition: sqltypes.h:502
void set_comp_param(int p)
Definition: sqltypes.h:505
Definition: sqltypes.h:69
#define CHECK(condition)
Definition: Logger.h:291
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:493

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_time_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1403 of file LazyParquetChunkLoader.cpp.

References CHECK, kENCODING_NONE, kTIME, and run_benchmark_import::type.

Referenced by suggest_column_scalar_type().

1403  {
1404  CHECK(parquet_column->logical_type()->is_time());
1405  SQLTypeInfo type;
1406  type.set_type(kTIME);
1407  type.set_compression(kENCODING_NONE);
1408  type.set_fixed_size();
1409  return type;
1410 }
Definition: sqltypes.h:66
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the caller graph for this function:

SQLTypeInfo foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_timestamp_mapping ( const parquet::ColumnDescriptor *  parquet_column)

Definition at line 1369 of file LazyParquetChunkLoader.cpp.

References is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_NONE, kTIMESTAMP, SQLTypeInfo::set_compression(), SQLTypeInfo::set_fixed_size(), SQLTypeInfo::set_precision(), SQLTypeInfo::set_type(), run_benchmark_import::type, and UNREACHABLE.

Referenced by suggest_column_scalar_type().

1369  {
1370  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1371  parquet_column->logical_type().get())) {
1372  SQLTypeInfo type;
1373  type.set_type(kTIMESTAMP);
1374  type.set_compression(kENCODING_NONE);
1375  if (is_nanosecond_precision(timestamp_logical_column)) {
1376  type.set_precision(9);
1377  } else if (is_microsecond_precision(timestamp_logical_column)) {
1378  type.set_precision(6);
1379  } else if (is_millisecond_precision(timestamp_logical_column)) {
1380  type.set_precision(3);
1381  }
1382  type.set_fixed_size();
1383  return type;
1384  }
1385  UNREACHABLE();
1386  return {};
1387 }
void set_compression(EncodingType c)
Definition: sqltypes.h:504
#define UNREACHABLE()
Definition: Logger.h:337
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
void set_fixed_size()
Definition: sqltypes.h:502
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
void set_precision(int d)
Definition: sqltypes.h:496
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:493

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_missing_metadata_error ( const int  row_group_index,
const int  column_index,
const std::string &  file_path 
)

Definition at line 1584 of file LazyParquetChunkLoader.cpp.

References to_string().

Referenced by validate_column_mapping_and_row_group_metadata().

1586  {
1587  throw std::runtime_error{
1588  "Statistics metadata is required for all row groups. Metadata is missing for "
1589  "row group index: " +
1590  std::to_string(row_group_index) +
1591  ", column index: " + std::to_string(column_index) + ", file path: " + file_path};
1592 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_row_group_larger_than_fragment_size_error ( const MaxRowGroupSizeStats  max_row_group_stats,
const int  fragment_size 
)

Definition at line 1600 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::file_path, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_index, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::MaxRowGroupSizeStats::max_row_group_size, and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1602  {
1603  auto metadata_scan_exception = MetadataScanInfeasibleFragmentSizeException{
1604  "Parquet file has a row group size that is larger than the fragment size. "
1605  "Please set the table fragment size to a number that is larger than the "
1606  "row group size. Row group index: " +
1607  std::to_string(max_row_group_stats.max_row_group_index) +
1608  ", row group size: " + std::to_string(max_row_group_stats.max_row_group_size) +
1609  ", fragment size: " + std::to_string(fragment_size) +
1610  ", file path: " + max_row_group_stats.file_path};
1611  metadata_scan_exception.min_feasible_fragment_size_ =
1612  max_row_group_stats.max_row_group_size;
1613  throw metadata_scan_exception;
1614 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping ( const parquet::ColumnDescriptor *  parquet_column,
const ColumnDescriptor *  omnisci_column 
)

Definition at line 1499 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_type_name(), foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported(), LOG, run_benchmark_import::type, and logger::WARNING.

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_column_mapping_and_row_group_metadata().

1500  {
1501  parquet::Type::type physical_type = parquet_column->physical_type();
1502  auto logical_type = parquet_column->logical_type();
1503  bool allowed_type =
1504  LazyParquetChunkLoader::isColumnMappingSupported(omnisci_column, parquet_column);
1505  if (!allowed_type) {
1506  if (logical_type->is_timestamp()) {
1507  auto timestamp_type =
1508  dynamic_cast<const parquet::TimestampLogicalType*>(logical_type.get());
1509  CHECK(timestamp_type);
1510 
1511  if (!timestamp_type->is_adjusted_to_utc()) {
1512  LOG(WARNING) << "Non-UTC timezone specified in Parquet file for column \""
1513  << omnisci_column->columnName
1514  << "\". Only UTC timezone is currently supported.";
1515  }
1516  }
1517  std::string parquet_type;
1518  if (parquet_column->logical_type()->is_none()) {
1519  parquet_type = parquet::TypeToString(physical_type);
1520  } else {
1521  parquet_type = logical_type->ToString();
1522  }
1523  std::string omnisci_type = omnisci_column->columnType.get_type_name();
1524  throw std::runtime_error{"Conversion from Parquet type \"" + parquet_type +
1525  "\" to HeavyDB type \"" + omnisci_type +
1526  "\" is not allowed. Please use an appropriate column type."};
1527  }
1528 }
#define LOG(tag)
Definition: Logger.h:285
std::string get_type_name() const
Definition: sqltypes.h:507
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1456 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, foreign_storage::get_sub_type_column_descriptor(), SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1457  {
1458  if (is_valid_parquet_list_column(parquet_column) &&
1459  omnisci_column->columnType.is_array()) {
1460  auto omnisci_column_sub_type_column = get_sub_type_column_descriptor(omnisci_column);
1461  return LazyParquetChunkLoader::isColumnMappingSupported(
1462  omnisci_column_sub_type_column.get(), parquet_column);
1463  }
1464  return false;
1465 }
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
SQLTypeInfo columnType
bool is_array() const
Definition: sqltypes.h:588

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_column_mapping_and_row_group_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1616 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::getLogicalColumns(), is_valid_parquet_list_column(), throw_missing_metadata_error(), and validate_allowed_mapping().

Referenced by validate_parquet_metadata().

1619  {
1620  auto column_it = schema.getLogicalColumns().begin();
1621  MaxRowGroupSizeStats max_row_group_stats{0, 0};
1622  for (int i = 0; i < file_metadata->num_columns(); ++i, ++column_it) {
1623  const parquet::ColumnDescriptor* descr = file_metadata->schema()->Column(i);
1624  try {
1625  validate_allowed_mapping(descr, *column_it);
1626  } catch (std::runtime_error& e) {
1627  std::stringstream error_message;
1628  error_message << e.what() << " Parquet column: " << descr->name()
1629  << ", HeavyDB column: " << (*column_it)->columnName
1630  << ", Parquet file: " << file_path << ".";
1631  throw std::runtime_error(error_message.str());
1632  }
1633 
1634  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
1635  auto group_metadata = file_metadata->RowGroup(r);
1636  auto num_rows = group_metadata->num_rows();
1637  if (num_rows == 0) {
1638  continue;
1639  } else if (num_rows > max_row_group_stats.max_row_group_size) {
1640  max_row_group_stats.max_row_group_size = num_rows;
1641  max_row_group_stats.max_row_group_index = r;
1642  max_row_group_stats.file_path = file_path;
1643  }
1644 
1645  auto column_chunk = group_metadata->ColumnChunk(i);
1646  bool contains_metadata = column_chunk->is_stats_set();
1647  if (contains_metadata) {
1648  auto stats = column_chunk->statistics();
1649  bool is_all_nulls = stats->null_count() == column_chunk->num_values();
1650  bool is_list = is_valid_parquet_list_column(file_metadata->schema()->Column(i));
1651  // Given a list, it is possible it has no min or max if it is comprised
1652  // only of empty lists & nulls. This can not be detected by comparing
1653  // the null count; therefore we afford list types the benefit of the
1654  // doubt in this situation.
1655  if (!(stats->HasMinMax() || is_all_nulls || is_list)) {
1656  contains_metadata = false;
1657  }
1658  }
1659 
1660  if (!contains_metadata) {
1661  throw_missing_metadata_error(r, i, file_path);
1662  }
1663  }
1664  }
1665  return max_row_group_stats;
1666 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
void throw_missing_metadata_error(const int row_group_index, const int column_index, const std::string &file_path)
void validate_allowed_mapping(const parquet::ColumnDescriptor *parquet_column, const ColumnDescriptor *omnisci_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1412 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kDATE, kENCODING_DATE_IN_DAYS, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1413  {
1414  if (!(omnisci_column->columnType.get_type() == kDATE &&
1415  ((omnisci_column->columnType.get_compression() == kENCODING_DATE_IN_DAYS &&
1416  (omnisci_column->columnType.get_comp_param() ==
1417  0 // DATE ENCODING DAYS (32) specifies comp_param of 0
1418  || omnisci_column->columnType.get_comp_param() == 16)) ||
1419  omnisci_column->columnType.get_compression() ==
1420  kENCODING_NONE // for array types
1421  ))) {
1422  return false;
1423  }
1424  return parquet_column->logical_type()->is_date() ||
1425  parquet_column->logical_type()
1426  ->is_timestamp(); // to support TIMESTAMP -> DATE coercion
1427 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:381
Definition: sqltypes.h:70
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:392
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1147 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_precision(), SQLTypeInfo::get_scale(), SQLTypeInfo::is_decimal(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1148  {
1149  if (auto decimal_logical_column = dynamic_cast<const parquet::DecimalLogicalType*>(
1150  parquet_column->logical_type().get())) {
1151  return omnisci_column->columnType.get_precision() ==
1152  decimal_logical_column->precision() &&
1153  omnisci_column->columnType.get_scale() == decimal_logical_column->scale() &&
1154  omnisci_column->columnType.is_decimal() &&
1155  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1156  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1157  }
1158  return false;
1159 }
HOST DEVICE int get_scale() const
Definition: sqltypes.h:386
int get_precision() const
Definition: sqltypes.h:384
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
SQLTypeInfo columnType
bool is_decimal() const
Definition: sqltypes.h:583

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_definition_levels ( const parquet::ParquetFileReader *  reader,
const int  row_group_index,
const int  column_index,
const int16_t *  def_levels,
const int64_t  num_levels,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1070 of file LazyParquetChunkLoader.cpp.

References is_valid_parquet_list_column(), and foreign_storage::validate_and_get_column_metadata_statistics().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup().

1076  {
1077  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1078  if (!is_valid_parquet_list) {
1079  return;
1080  }
1081  std::unique_ptr<parquet::RowGroupMetaData> group_metadata =
1082  reader->metadata()->RowGroup(row_group_index);
1083  auto column_metadata = group_metadata->ColumnChunk(column_index);
1084  auto stats = validate_and_get_column_metadata_statistics(column_metadata.get());
1085  if (!stats->HasMinMax()) {
1086  auto find_it = std::find_if(def_levels,
1087  def_levels + num_levels,
1088  [](const int16_t def_level) { return def_level == 3; });
1089  if (find_it != def_levels + num_levels) {
1090  throw std::runtime_error(
1091  "No minimum and maximum statistic set in list column but non-null & non-empty "
1092  "array/value detected.");
1093  }
1094  }
1095 }
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema ( const parquet::arrow::FileReader *  reference_file_reader,
const parquet::arrow::FileReader *  new_file_reader,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 1473 of file LazyParquetChunkLoader.cpp.

References foreign_storage::get_column_descriptor(), to_string(), and foreign_storage::validate_equal_column_descriptor().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan(), and foreign_storage::LazyParquetChunkLoader::previewFiles().

1476  {
1477  const auto reference_num_columns =
1478  reference_file_reader->parquet_reader()->metadata()->num_columns();
1479  const auto new_num_columns =
1480  new_file_reader->parquet_reader()->metadata()->num_columns();
1481  if (reference_num_columns != new_num_columns) {
1482  throw std::runtime_error{"Parquet file \"" + new_file_path +
1483  "\" has a different schema. Please ensure that all Parquet "
1484  "files use the same schema. Reference Parquet file: \"" +
1485  reference_file_path + "\" has " +
1486  std::to_string(reference_num_columns) +
1487  " columns. New Parquet file \"" + new_file_path + "\" has " +
1488  std::to_string(new_num_columns) + " columns."};
1489  }
1490 
1491  for (int i = 0; i < reference_num_columns; i++) {
1492  validate_equal_column_descriptor(get_column_descriptor(reference_file_reader, i),
1493  get_column_descriptor(new_file_reader, i),
1494  reference_file_path,
1495  new_file_path);
1496  }
1497 }
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
std::string to_string(char const *&&v)
const ColumnDescriptor * get_column_descriptor(const shared::ColumnKey &column_key)
Definition: Execute.h:192

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1186 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), SQLTypeInfo::is_fp(), kENCODING_NONE, and kFLOAT.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1187  {
1188  if (!omnisci_column->columnType.is_fp()) {
1189  return false;
1190  }
1191  // check if mapping is a valid coerced or non-coerced floating point mapping
1192  // with no annotation (floating point columns have no annotation in the
1193  // Parquet specification)
1194  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1195  return (parquet_column->physical_type() == parquet::Type::DOUBLE) ||
1196  (parquet_column->physical_type() == parquet::Type::FLOAT &&
1197  omnisci_column->columnType.get_type() == kFLOAT);
1198  }
1199  return false;
1200 }
bool is_fp() const
Definition: sqltypes.h:584
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:381
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1467 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::is_geometry(), and is_valid_parquet_string().

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1468  {
1469  return is_valid_parquet_string(parquet_column) &&
1470  omnisci_column->columnType.is_geometry();
1471 }
bool is_geometry() const
Definition: sqltypes.h:592
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1217 of file LazyParquetChunkLoader.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_size(), SQLTypeInfo::is_integer(), kENCODING_FIXED, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1218  {
1219  if (!omnisci_column->columnType.is_integer()) {
1220  return false;
1221  }
1222  if (auto int_logical_column = dynamic_cast<const parquet::IntLogicalType*>(
1223  parquet_column->logical_type().get())) {
1224  CHECK(omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1225  omnisci_column->columnType.get_compression() == kENCODING_FIXED);
1226  const int bits_per_byte = 8;
1227  // unsigned types are permitted to map to a wider integral type in order to avoid
1228  // precision loss
1229  const int bit_widening_factor = int_logical_column->is_signed() ? 1 : 2;
1230  return omnisci_column->columnType.get_size() * bits_per_byte <=
1231  int_logical_column->bit_width() * bit_widening_factor;
1232  }
1233  // check if mapping is a valid coerced or non-coerced integral mapping with no
1234  // annotation
1235  if ((omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1236  omnisci_column->columnType.get_compression() == kENCODING_FIXED)) {
1237  return (parquet_column->physical_type() == parquet::Type::INT64) ||
1238  (parquet_column->physical_type() == parquet::Type::INT32 &&
1239  omnisci_column->columnType.get_size() <= 4);
1240  }
1241  return false;
1242 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:393
bool is_integer() const
Definition: sqltypes.h:582
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
#define CHECK(condition)
Definition: Logger.h:291
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level ( const ColumnDescriptor *  omnisci_column_descriptor,
const parquet::ColumnDescriptor *  parquet_column_descriptor 
)

Definition at line 1097 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::is_array(), is_valid_parquet_list_column(), and to_string().

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::LazyParquetChunkLoader::loadRowGroups().

1099  {
1100  bool is_valid_parquet_list = is_valid_parquet_list_column(parquet_column_descriptor);
1101  if (is_valid_parquet_list && !omnisci_column_descriptor->columnType.is_array()) {
1102  throw std::runtime_error(
1103  "Unsupported mapping detected. Column '" + parquet_column_descriptor->name() +
1104  "' detected to be a parquet list but HeavyDB mapped column '" +
1105  omnisci_column_descriptor->columnName + "' is not an array.");
1106  }
1107  if (is_valid_parquet_list) {
1108  if (parquet_column_descriptor->max_repetition_level() != 1 ||
1109  parquet_column_descriptor->max_definition_level() != 3) {
1110  throw std::runtime_error(
1111  "Incorrect schema max repetition level detected in column '" +
1112  parquet_column_descriptor->name() +
1113  "'. Expected a max repetition level of 1 and max definition level of 3 for "
1114  "list column but column has a max "
1115  "repetition level of " +
1116  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1117  " and a max definition level of " +
1118  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1119  }
1120  } else {
1121  if (parquet_column_descriptor->max_repetition_level() != 0 ||
1122  parquet_column_descriptor->max_definition_level() != 1) {
1123  throw std::runtime_error(
1124  "Incorrect schema max repetition level detected in column '" +
1125  parquet_column_descriptor->name() +
1126  "'. Expected a max repetition level of 0 and max definition level of 1 for "
1127  "flat column but column has a max "
1128  "repetition level of " +
1129  std::to_string(parquet_column_descriptor->max_repetition_level()) +
1130  " and a max definition level of " +
1131  std::to_string(parquet_column_descriptor->max_definition_level()) + ".");
1132  }
1133  }
1134 }
bool is_valid_parquet_list_column(const parquet::ColumnDescriptor *parquet_column)
Detect a valid list parquet column.
std::string to_string(char const *&&v)
SQLTypeInfo columnType
std::string columnName
bool is_array() const
Definition: sqltypes.h:588

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1316 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kBOOLEAN, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1317  {
1318  bool is_none_encoded_mapping =
1319  omnisci_column->columnType.get_compression() == kENCODING_NONE &&
1320  (parquet_column->physical_type() == parquet::Type::BOOLEAN &&
1321  omnisci_column->columnType.get_type() == kBOOLEAN);
1322  return parquet_column->logical_type()->is_none() && is_none_encoded_mapping;
1323 }
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:381
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1574 of file LazyParquetChunkLoader.cpp.

References foreign_storage::ForeignTableSchema::numLogicalColumns(), and foreign_storage::throw_number_of_columns_mismatch_error().

Referenced by foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and validate_parquet_metadata().

1577  {
1578  if (schema.numLogicalColumns() != file_metadata->num_columns()) {
1579  throw_number_of_columns_mismatch_error(
1580  schema.numLogicalColumns(), file_metadata->num_columns(), file_path);
1581  }
1582 }
void throw_number_of_columns_mismatch_error(size_t num_table_cols, size_t num_file_cols, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

MaxRowGroupSizeStats foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata ( const std::shared_ptr< parquet::FileMetaData > &  file_metadata,
const std::string &  file_path,
const ForeignTableSchema &  schema 
)

Definition at line 1668 of file LazyParquetChunkLoader.cpp.

References validate_column_mapping_and_row_group_metadata(), and validate_number_of_columns().

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan().

1671  {
1672  validate_number_of_columns(file_metadata, file_path, schema);
1673  return validate_column_mapping_and_row_group_metadata(file_metadata, file_path, schema);
1674 }
MaxRowGroupSizeStats validate_column_mapping_and_row_group_metadata(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
void validate_number_of_columns(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1438 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_compression(), SQLTypeInfo::is_string(), is_valid_parquet_string(), kENCODING_DICT, and kENCODING_NONE.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1439  {
1440  return is_valid_parquet_string(parquet_column) &&
1441  omnisci_column->columnType.is_string() &&
1442  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1443  omnisci_column->columnType.get_compression() == kENCODING_DICT);
1444 }
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
bool is_valid_parquet_string(const parquet::ColumnDescriptor *parquet_column)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:580

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1389 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_type(), kENCODING_FIXED, kENCODING_NONE, and kTIME.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1390  {
1391  if (!(omnisci_column->columnType.get_type() == kTIME &&
1392  (omnisci_column->columnType.get_compression() == kENCODING_NONE ||
1393  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1394  omnisci_column->columnType.get_comp_param() == 32)))) {
1395  return false;
1396  }
1397  if (parquet_column->logical_type()->is_time()) {
1398  return true;
1399  }
1400  return false;
1401 }
Definition: sqltypes.h:66
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:381
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:392
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)

Definition at line 1334 of file LazyParquetChunkLoader.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_compression(), SQLTypeInfo::get_dimension(), SQLTypeInfo::get_type(), is_microsecond_precision(), is_millisecond_precision(), is_nanosecond_precision(), kENCODING_FIXED, kENCODING_NONE, and kTIMESTAMP.

Referenced by foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported().

1335  {
1336  if (!(omnisci_column->columnType.get_type() == kTIMESTAMP &&
1337  ((omnisci_column->columnType.get_compression() == kENCODING_NONE) ||
1338  (omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1339  omnisci_column->columnType.get_comp_param() == 32)))) {
1340  return false;
1341  }
1342  // check the annotated case
1343  if (auto timestamp_logical_column = dynamic_cast<const parquet::TimestampLogicalType*>(
1344  parquet_column->logical_type().get())) {
1345  if (omnisci_column->columnType.get_compression() == kENCODING_NONE) {
1346  return omnisci_column->columnType.get_dimension() == 0 ||
1347  ((is_nanosecond_precision(omnisci_column) &&
1348  is_nanosecond_precision(timestamp_logical_column)) ||
1349  (is_microsecond_precision(omnisci_column) &&
1350  is_microsecond_precision(timestamp_logical_column)) ||
1351  (is_millisecond_precision(omnisci_column) &&
1352  is_millisecond_precision(timestamp_logical_column)));
1353  }
1354  if (omnisci_column->columnType.get_compression() == kENCODING_FIXED) {
1355  return omnisci_column->columnType.get_dimension() == 0;
1356  }
1357  }
1358  // check the unannotated case
1359  if (parquet_column->logical_type()->is_none() &&
1360  ((parquet_column->physical_type() == parquet::Type::INT32 &&
1361  omnisci_column->columnType.get_compression() == kENCODING_FIXED &&
1362  omnisci_column->columnType.get_comp_param() == 32) ||
1363  parquet_column->physical_type() == parquet::Type::INT64)) {
1364  return true;
1365  }
1366  return false;
1367 }
bool is_nanosecond_precision(const ColumnDescriptor *omnisci_column)
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:381
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:383
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:392
bool is_millisecond_precision(const ColumnDescriptor *omnisci_column)
bool is_microsecond_precision(const ColumnDescriptor *omnisci_column)
SQLTypeInfo columnType

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::within_range ( int64_t  lower_bound,
int64_t  upper_bound,
int64_t  value 
)

Definition at line 55 of file LazyParquetChunkLoader.cpp.

References gpu_enabled::upper_bound().

Referenced by suggest_integral_mapping().

55  {
56  return value >= lower_bound && value <= upper_bound;
57 }
DEVICE auto upper_bound(ARGS &&...args)
Definition: gpu_enabled.h:123
DEVICE auto lower_bound(ARGS &&...args)
Definition: gpu_enabled.h:78

+ Here is the call graph for this function:

+ Here is the caller graph for this function: