OmniSciDB  95562058bd
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
foreign_storage::LazyParquetChunkLoader Class Reference

#include <LazyParquetChunkLoader.h>

Public Member Functions

 LazyParquetChunkLoader (std::shared_ptr< arrow::fs::FileSystem > file_system)
 
std::list< std::unique_ptr
< ChunkMetadata > > 
loadChunk (const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary=nullptr)
 
std::list< RowGroupMetadata > metadataScan (const std::set< std::string > &file_paths, const ForeignTableSchema &schema)
 Perform a metadata scan for the paths specified. More...
 

Static Public Member Functions

static bool isColumnMappingSupported (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 

Static Public Attributes

static const int batch_reader_num_elements = 4096
 

Private Attributes

std::shared_ptr
< arrow::fs::FileSystem > 
file_system_
 

Detailed Description

A lazy parquet to chunk loader

Definition at line 32 of file LazyParquetChunkLoader.h.

Constructor & Destructor Documentation

foreign_storage::LazyParquetChunkLoader::LazyParquetChunkLoader ( std::shared_ptr< arrow::fs::FileSystem >  file_system)

Definition at line 1105 of file LazyParquetChunkLoader.cpp.

1107  : file_system_(file_system) {}
std::shared_ptr< arrow::fs::FileSystem > file_system_

Member Function Documentation

bool foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)
static

Determine if a Parquet to OmniSci column mapping is supported.

Parameters
omnisci_column- the column descriptor of the OmniSci column
parquet_column- the column descriptor of the Parquet column
Returns
true if the column mapping is supported by LazyParquetChunkLoader, false otherwise

Definition at line 1072 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping().

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

1074  {
1075  if (validate_geospatial_mapping(omnisci_column, parquet_column)) {
1076  return true;
1077  }
1078  if (validate_array_mapping(omnisci_column, parquet_column)) {
1079  return true;
1080  }
1081  if (validate_decimal_mapping(omnisci_column, parquet_column)) {
1082  return true;
1083  }
1084  if (validate_integral_mapping(omnisci_column, parquet_column)) {
1085  return true;
1086  }
1087  if (validate_none_type_mapping(omnisci_column, parquet_column)) {
1088  return true;
1089  }
1090  if (validate_timestamp_mapping(omnisci_column, parquet_column)) {
1091  return true;
1092  }
1093  if (validate_time_mapping(omnisci_column, parquet_column)) {
1094  return true;
1095  }
1096  if (validate_date_mapping(omnisci_column, parquet_column)) {
1097  return true;
1098  }
1099  if (validate_string_mapping(omnisci_column, parquet_column)) {
1100  return true;
1101  }
1102  return false;
1103 }
bool validate_array_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_time_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_integral_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_date_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_timestamp_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_geospatial_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_decimal_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_none_type_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_string_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::list< std::unique_ptr< ChunkMetadata > > foreign_storage::LazyParquetChunkLoader::loadChunk ( const std::vector< RowGroupInterval > &  row_group_intervals,
const int  parquet_column_index,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary = nullptr 
)

Load a number of row groups of a column in a parquet file into a chunk

Parameters
row_group_intervals- inclusive intervals [start,end] that specify the row groups to load
parquet_column_index- the logical column index in the parquet file (and omnisci db) of the column to load
chunks- a list containing the chunks to load
string_dictionary- a string dictionary for the column being loaded, if applicable
Returns
An empty list when no metadata update is applicable, otherwise a list of ChunkMetadata unique pointers with which to update the corresponding column chunk metadata. NOTE: Only ChunkMetadata.sqlType and the min & max values of the ChunkMetadata.chunkStats are valid; other values are not set.

NOTE: if more than one chunk is supplied, the first chunk is required to be the chunk corresponding to the logical column, while the remaining chunks correspond to physical columns (in ascending order of column id). Similarly, if a metadata update is expected, the list of ChunkMetadata unique pointers returned will correspond directly to the list of chunks.

Definition at line 1109 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), CHECK, and file_system_.

Referenced by foreign_storage::ParquetDataWrapper::loadBuffersUsingLazyParquetChunkLoader().

1113  {
1114  CHECK(!chunks.empty());
1115  auto const& chunk = *chunks.begin();
1116  auto column_descriptor = chunk.getColumnDesc();
1117  auto buffer = chunk.getBuffer();
1118  CHECK(buffer);
1119 
1120  auto metadata = append_row_groups(row_group_intervals,
1121  parquet_column_index,
1122  column_descriptor,
1123  chunks,
1124  string_dictionary,
1125  file_system_);
1126  return metadata;
1127 }
#define CHECK(condition)
Definition: Logger.h:197
std::list< std::unique_ptr< ChunkMetadata > > append_row_groups(const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, const ColumnDescriptor *column_descriptor, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::shared_ptr< arrow::fs::FileSystem > file_system)
std::shared_ptr< arrow::fs::FileSystem > file_system_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::list< RowGroupMetadata > foreign_storage::LazyParquetChunkLoader::metadataScan ( const std::set< std::string > &  file_paths,
const ForeignTableSchema &  schema 
)

Perform a metadata scan for the paths specified.

Parameters
file_paths- (ordered) files of the metadata scan
schema- schema of the foreign table to perform metadata scan for
Returns
a list of the row group metadata extracted from file_paths

Definition at line 1129 of file LazyParquetChunkLoader.cpp.

References CHECK, file_system_, foreign_storage::get_parquet_table_size(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval(), foreign_storage::open_parquet_table(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata().

Referenced by foreign_storage::ParquetDataWrapper::metadataScanFiles().

1131  {
1132  std::list<RowGroupMetadata> row_group_metadata;
1133  auto column_interval =
1134  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
1135  schema.getLogicalAndPhysicalColumns().back()->columnId};
1136  CHECK(!file_paths.empty());
1137  std::unique_ptr<parquet::arrow::FileReader> first_file_reader;
1138  const auto& first_file_path = *file_paths.begin();
1139  open_parquet_table(first_file_path, first_file_reader, file_system_);
1140  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1141  for (const auto& file_path : file_paths) {
1142  std::unique_ptr<parquet::arrow::FileReader> reader;
1143  open_parquet_table(file_path, reader, file_system_);
1144  validate_equal_schema(
1145  first_file_reader.get(), reader.get(), first_file_path, file_path);
1146  int num_row_groups = get_parquet_table_size(reader).first;
1147  auto row_group_interval = RowGroupInterval{file_path, 0, num_row_groups - 1};
1148  validate_parquet_metadata(reader->parquet_reader()->metadata(), file_path, schema);
1149  if (file_path == first_file_path) {
1150  populate_encoder_map(encoder_map, column_interval, schema, first_file_reader);
1151  }
1152  metadata_scan_rowgroup_interval(
1153  encoder_map, row_group_interval, reader, schema, row_group_metadata);
1154  }
1155  return row_group_metadata;
1156 }
void validate_parquet_metadata(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
void metadata_scan_rowgroup_interval(const std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const RowGroupInterval &row_group_interval, const std::unique_ptr< parquet::arrow::FileReader > &reader, const ForeignTableSchema &schema, std::list< RowGroupMetadata > &row_group_metadata)
void validate_equal_schema(const parquet::arrow::FileReader *reference_file_reader, const parquet::arrow::FileReader *new_file_reader, const std::string &reference_file_path, const std::string &new_file_path)
std::pair< int, int > get_parquet_table_size(const std::unique_ptr< parquet::arrow::FileReader > &reader)
void populate_encoder_map(std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const Interval< ColumnType > &column_interval, const ForeignTableSchema &schema, const std::unique_ptr< parquet::arrow::FileReader > &reader)
#define CHECK(condition)
Definition: Logger.h:197
void open_parquet_table(const std::string &file_path, std::unique_ptr< parquet::arrow::FileReader > &reader, std::shared_ptr< arrow::fs::FileSystem > &file_system)
std::shared_ptr< arrow::fs::FileSystem > file_system_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

Member Data Documentation

const int foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements = 4096
static
std::shared_ptr<arrow::fs::FileSystem> foreign_storage::LazyParquetChunkLoader::file_system_
private

Definition at line 94 of file LazyParquetChunkLoader.h.

Referenced by loadChunk(), and metadataScan().


The documentation for this class was generated from the following files: