OmniSciDB  bf83d84833
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
foreign_storage::LazyParquetChunkLoader Class Reference

#include <LazyParquetChunkLoader.h>

Public Member Functions

 LazyParquetChunkLoader (std::shared_ptr< arrow::fs::FileSystem > file_system)
 
std::list< std::unique_ptr
< ChunkMetadata > > 
loadChunk (const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary=nullptr)
 
std::list< RowGroupMetadata > metadataScan (const std::set< std::string > &file_paths, const ForeignTableSchema &schema)
 Perform a metadata scan for the paths specified. More...
 

Static Public Member Functions

static bool isColumnMappingSupported (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 

Static Public Attributes

static const int batch_reader_num_elements = 4096
 

Private Attributes

std::shared_ptr
< arrow::fs::FileSystem > 
file_system_
 

Detailed Description

A lazy parquet to chunk loader

Definition at line 32 of file LazyParquetChunkLoader.h.

Constructor & Destructor Documentation

foreign_storage::LazyParquetChunkLoader::LazyParquetChunkLoader ( std::shared_ptr< arrow::fs::FileSystem >  file_system)

Definition at line 1478 of file LazyParquetChunkLoader.cpp.

1480  : file_system_(file_system) {}
std::shared_ptr< arrow::fs::FileSystem > file_system_

Member Function Documentation

bool foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)
static

Determine if a Parquet to OmniSci column mapping is supported.

Parameters
omnisci_column- the column descriptor of the OmniSci column
parquet_column- the column descriptor of the Parquet column
Returns
true if the column mapping is supported by LazyParquetChunkLoader, false otherwise

Definition at line 1442 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping().

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

1444  {
1445  if (validate_geospatial_mapping(omnisci_column, parquet_column)) {
1446  return true;
1447  }
1448  if (validate_array_mapping(omnisci_column, parquet_column)) {
1449  return true;
1450  }
1451  if (validate_decimal_mapping(omnisci_column, parquet_column)) {
1452  return true;
1453  }
1454  if (validate_floating_point_mapping(omnisci_column, parquet_column)) {
1455  return true;
1456  }
1457  if (validate_integral_mapping(omnisci_column, parquet_column)) {
1458  return true;
1459  }
1460  if (validate_none_type_mapping(omnisci_column, parquet_column)) {
1461  return true;
1462  }
1463  if (validate_timestamp_mapping(omnisci_column, parquet_column)) {
1464  return true;
1465  }
1466  if (validate_time_mapping(omnisci_column, parquet_column)) {
1467  return true;
1468  }
1469  if (validate_date_mapping(omnisci_column, parquet_column)) {
1470  return true;
1471  }
1472  if (validate_string_mapping(omnisci_column, parquet_column)) {
1473  return true;
1474  }
1475  return false;
1476 }
bool validate_array_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_time_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_integral_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_date_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_timestamp_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_geospatial_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_decimal_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_none_type_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_floating_point_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
bool validate_string_mapping(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::list< std::unique_ptr< ChunkMetadata > > foreign_storage::LazyParquetChunkLoader::loadChunk ( const std::vector< RowGroupInterval > &  row_group_intervals,
const int  parquet_column_index,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary = nullptr 
)

Load a number of row groups of a column in a parquet file into a chunk

Parameters
row_group_intervals- inclusive intervals [start,end] that specify the row groups to load
parquet_column_index- the logical column index in the parquet file (and omnisci db) of the column to load
chunks- a list containing the chunks to load
string_dictionary- the string dictionary corresponding to the column, if applicable
Returns
An empty list when no metadata update is applicable, otherwise a list of ChunkMetadata unique pointers with which to update the corresponding column chunk metadata. NOTE: Only ChunkMetadata.sqlType and the min & max values of the ChunkMetadata.chunkStats are valid; other values are not set.

NOTE: if more than one chunk is supplied, the first chunk is required to be the chunk corresponding to the logical column, while the remaining chunks correspond to physical columns (in ascending order of column id). Similarly, if a metadata update is expected, the list of ChunkMetadata unique pointers returned will correspond directly to the list of chunks.

Definition at line 1482 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), CHECK, and file_system_.

Referenced by foreign_storage::ParquetDataWrapper::loadBuffersUsingLazyParquetChunkLoader().

1486  {
1487  CHECK(!chunks.empty());
1488  auto const& chunk = *chunks.begin();
1489  auto column_descriptor = chunk.getColumnDesc();
1490  auto buffer = chunk.getBuffer();
1491  CHECK(buffer);
1492 
1493  try {
1494  auto metadata = append_row_groups(row_group_intervals,
1495  parquet_column_index,
1496  column_descriptor,
1497  chunks,
1498  string_dictionary,
1499  file_system_);
1500  return metadata;
1501  } catch (const std::exception& error) {
1502  throw ForeignStorageException(error.what());
1503  }
1504 
1505  return {};
1506 }
#define CHECK(condition)
Definition: Logger.h:197
std::list< std::unique_ptr< ChunkMetadata > > append_row_groups(const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, const ColumnDescriptor *column_descriptor, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, std::shared_ptr< arrow::fs::FileSystem > file_system)
std::shared_ptr< arrow::fs::FileSystem > file_system_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::list< RowGroupMetadata > foreign_storage::LazyParquetChunkLoader::metadataScan ( const std::set< std::string > &  file_paths,
const ForeignTableSchema &  schema 
)

Perform a metadata scan for the paths specified.

Parameters
file_paths- (ordered) files of the metadata scan
schema- schema of the foreign table to perform metadata scan for
Returns
a list of the row group metadata extracted from file_paths

Definition at line 1508 of file LazyParquetChunkLoader.cpp.

References CHECK, file_system_, foreign_storage::get_parquet_table_size(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval(), foreign_storage::open_parquet_table(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata().

Referenced by foreign_storage::ParquetDataWrapper::metadataScanFiles().

1510  {
1511  std::list<RowGroupMetadata> row_group_metadata;
1512  auto column_interval =
1513  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
1514  schema.getLogicalAndPhysicalColumns().back()->columnId};
1515  CHECK(!file_paths.empty());
1516  std::unique_ptr<parquet::arrow::FileReader> first_file_reader;
1517  const auto& first_file_path = *file_paths.begin();
1518  open_parquet_table(first_file_path, first_file_reader, file_system_);
1519  std::map<int, std::shared_ptr<ParquetEncoder>> encoder_map;
1520  for (const auto& file_path : file_paths) {
1521  std::unique_ptr<parquet::arrow::FileReader> reader;
1522  open_parquet_table(file_path, reader, file_system_);
1523  validate_equal_schema(
1524  first_file_reader.get(), reader.get(), first_file_path, file_path);
1525  int num_row_groups = get_parquet_table_size(reader).first;
1526  auto row_group_interval = RowGroupInterval{file_path, 0, num_row_groups - 1};
1527  validate_parquet_metadata(reader->parquet_reader()->metadata(), file_path, schema);
1528  if (file_path == first_file_path) {
1529  populate_encoder_map(encoder_map, column_interval, schema, first_file_reader);
1530  }
1531  metadata_scan_rowgroup_interval(
1532  encoder_map, row_group_interval, reader, schema, row_group_metadata);
1533  }
1534  return row_group_metadata;
1535 }
void validate_parquet_metadata(const std::shared_ptr< parquet::FileMetaData > &file_metadata, const std::string &file_path, const ForeignTableSchema &schema)
void metadata_scan_rowgroup_interval(const std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const RowGroupInterval &row_group_interval, const std::unique_ptr< parquet::arrow::FileReader > &reader, const ForeignTableSchema &schema, std::list< RowGroupMetadata > &row_group_metadata)
void validate_equal_schema(const parquet::arrow::FileReader *reference_file_reader, const parquet::arrow::FileReader *new_file_reader, const std::string &reference_file_path, const std::string &new_file_path)
std::pair< int, int > get_parquet_table_size(const std::unique_ptr< parquet::arrow::FileReader > &reader)
void populate_encoder_map(std::map< int, std::shared_ptr< ParquetEncoder >> &encoder_map, const Interval< ColumnType > &column_interval, const ForeignTableSchema &schema, const std::unique_ptr< parquet::arrow::FileReader > &reader)
#define CHECK(condition)
Definition: Logger.h:197
void open_parquet_table(const std::string &file_path, std::unique_ptr< parquet::arrow::FileReader > &reader, std::shared_ptr< arrow::fs::FileSystem > &file_system)
std::shared_ptr< arrow::fs::FileSystem > file_system_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

Member Data Documentation

const int foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements = 4096
static
std::shared_ptr<arrow::fs::FileSystem> foreign_storage::LazyParquetChunkLoader::file_system_
private

Definition at line 94 of file LazyParquetChunkLoader.h.

Referenced by loadChunk(), and metadataScan().


The documentation for this class was generated from the following files: