OmniSciDB  085a039ca4
foreign_storage::LazyParquetChunkLoader Class Reference

#include <LazyParquetChunkLoader.h>


Public Member Functions

 LazyParquetChunkLoader (std::shared_ptr< arrow::fs::FileSystem > file_system, FileReaderMap *file_reader_cache, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
std::list< std::unique_ptr< ChunkMetadata > > loadChunk (const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary=nullptr, RejectedRowIndices *rejected_row_indices=nullptr)
 
std::list< RowGroupMetadata > metadataScan (const std::vector< std::string > &file_paths, const ForeignTableSchema &schema, const bool do_metadata_stats_validation=true)
 Perform a metadata scan for the paths specified. More...
 
std::pair< size_t, size_t > loadRowGroups (const RowGroupInterval &row_group_interval, const std::map< int, Chunk_NS::Chunk > &chunks, const ForeignTableSchema &schema, const std::map< int, StringDictionary * > &column_dictionaries, const int num_threads=1)
 Load row groups of data into given chunks. More...
 
DataPreview previewFiles (const std::vector< std::string > &files, const size_t max_num_rows)
 Preview rows of data and column types in a set of files. More...
 

Static Public Member Functions

static bool isColumnMappingSupported (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
 

Static Public Attributes

static const int batch_reader_num_elements = 4096
 

Private Member Functions

std::list< std::unique_ptr< ChunkMetadata > > appendRowGroups (const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, const ColumnDescriptor *column_descriptor, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, RejectedRowIndices *rejected_row_indices, const bool is_for_detect=false, const std::optional< int64_t > max_levels_read=std::nullopt)
 

Static Private Member Functions

static SQLTypeInfo suggestColumnMapping (const parquet::ColumnDescriptor *parquet_column)
 

Private Attributes

std::shared_ptr< arrow::fs::FileSystem > file_system_
 
FileReaderMap * file_reader_cache_
 
const RenderGroupAnalyzerMap * render_group_analyzer_map_
 

Detailed Description

A lazy Parquet-to-chunk loader: loads row groups of Parquet columns into OmniSci chunks on demand.

Definition at line 37 of file LazyParquetChunkLoader.h.
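
The following is a minimal construction sketch, not taken from the source: the local file system, the empty reader cache, and the null render group analyzer map are assumptions for illustration (callers in the codebase, such as ParquetDataWrapper, supply their own instances).

// Hedged sketch: build a loader over the local file system.
#include <arrow/filesystem/localfs.h>

auto file_system = std::make_shared<arrow::fs::LocalFileSystem>();
foreign_storage::FileReaderMap file_reader_cache;
foreign_storage::LazyParquetChunkLoader loader(
    file_system, &file_reader_cache, /*render_group_analyzer_map=*/nullptr);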

Constructor & Destructor Documentation

foreign_storage::LazyParquetChunkLoader::LazyParquetChunkLoader ( std::shared_ptr< arrow::fs::FileSystem >  file_system,
FileReaderMap *  file_reader_cache,
const RenderGroupAnalyzerMap *  render_group_analyzer_map 
)

Definition at line 1963 of file LazyParquetChunkLoader.cpp.

1963 LazyParquetChunkLoader::LazyParquetChunkLoader(
1964     std::shared_ptr<arrow::fs::FileSystem> file_system,
1965     FileReaderMap* file_map,
1966     const RenderGroupAnalyzerMap* render_group_analyzer_map)
1967  : file_system_(file_system)
1968  , file_reader_cache_(file_map)
1969  , render_group_analyzer_map_{render_group_analyzer_map} {}

Member Function Documentation

std::list< std::unique_ptr< ChunkMetadata > > foreign_storage::LazyParquetChunkLoader::appendRowGroups ( const std::vector< RowGroupInterval > &  row_group_intervals,
const int  parquet_column_index,
const ColumnDescriptor *  column_descriptor,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary,
RejectedRowIndices *  rejected_row_indices,
const bool  is_for_detect = false,
const std::optional< int64_t >  max_levels_read = std::nullopt 
)
private

Definition at line 1783 of file LazyParquetChunkLoader.cpp.

References batch_reader_num_elements, CHECK, ColumnDescriptor::columnType, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder(), DEBUG_TIMER, file_reader_cache_, file_system_, foreign_storage::get_column_descriptor(), foreign_storage::get_parquet_table_size(), foreign_storage::FileReaderMap::getOrInsert(), render_group_analyzer_map_, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer(), to_string(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_definition_levels(), foreign_storage::validate_equal_column_descriptor(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level().

Referenced by loadChunk(), and previewFiles().

1791  {
1792  auto timer = DEBUG_TIMER(__func__);
1793  std::list<std::unique_ptr<ChunkMetadata>> chunk_metadata;
1794  // `def_levels` and `rep_levels` below are used to store the read definition
1795  // and repetition levels of the Dremel encoding implemented by the Parquet
1796  // format
1797  std::vector<int16_t> def_levels(LazyParquetChunkLoader::batch_reader_num_elements);
1798  std::vector<int16_t> rep_levels(LazyParquetChunkLoader::batch_reader_num_elements);
1799  std::vector<int8_t> values;
1800 
1801  CHECK(!row_group_intervals.empty());
1802  const auto& first_file_path = row_group_intervals.front().file_path;
1803 
1804  auto first_file_reader = file_reader_cache_->getOrInsert(first_file_path, file_system_);
1805  auto first_parquet_column_descriptor =
1806  get_column_descriptor(first_file_reader, parquet_column_index);
1807  resize_values_buffer(column_descriptor, first_parquet_column_descriptor, values);
1808  auto encoder = create_parquet_encoder(column_descriptor,
1809  first_parquet_column_descriptor,
1810  chunks,
1811  string_dictionary,
1812  chunk_metadata,
1813  render_group_analyzer_map_,
1814  false,
1815  false,
1816  is_for_detect);
1817  CHECK(encoder.get());
1818 
1819  if (rejected_row_indices) { // error tracking is enabled
1820  encoder->initializeErrorTracking(column_descriptor->columnType);
1821  }
1822 
1823  bool early_exit = false;
1824  int64_t total_levels_read = 0;
1825  for (const auto& row_group_interval : row_group_intervals) {
1826  const auto& file_path = row_group_interval.file_path;
1827  auto file_reader = file_reader_cache_->getOrInsert(file_path, file_system_);
1828 
1829  auto [num_row_groups, num_columns] = get_parquet_table_size(file_reader);
1830  CHECK(row_group_interval.start_index >= 0 &&
1831  row_group_interval.end_index < num_row_groups);
1832  CHECK(parquet_column_index >= 0 && parquet_column_index < num_columns);
1833 
1834  parquet::ParquetFileReader* parquet_reader = file_reader->parquet_reader();
1835  auto parquet_column_descriptor =
1836  get_column_descriptor(file_reader, parquet_column_index);
1837  validate_equal_column_descriptor(first_parquet_column_descriptor,
1838  parquet_column_descriptor,
1839  first_file_path,
1840  file_path);
1841 
1842  validate_max_repetition_and_definition_level(column_descriptor,
1843  parquet_column_descriptor);
1844  int64_t values_read = 0;
1845  for (int row_group_index = row_group_interval.start_index;
1846  row_group_index <= row_group_interval.end_index;
1847  ++row_group_index) {
1848  auto group_reader = parquet_reader->RowGroup(row_group_index);
1849  std::shared_ptr<parquet::ColumnReader> col_reader =
1850  group_reader->Column(parquet_column_index);
1851 
1852  try {
1853  while (col_reader->HasNext()) {
1854  int64_t levels_read =
1855  parquet::ScanAllValues(LazyParquetChunkLoader::batch_reader_num_elements,
1856  def_levels.data(),
1857  rep_levels.data(),
1858  reinterpret_cast<uint8_t*>(values.data()),
1859  &values_read,
1860  col_reader.get());
1861 
1862  validate_definition_levels(parquet_reader,
1863  row_group_index,
1864  parquet_column_index,
1865  def_levels.data(),
1866  levels_read,
1867  parquet_column_descriptor);
1868 
1869  if (rejected_row_indices) { // error tracking is enabled
1870  encoder->appendDataTrackErrors(def_levels.data(),
1871  rep_levels.data(),
1872  values_read,
1873  levels_read,
1874  values.data());
1875  } else { // no error tracking enabled
1876  encoder->appendData(def_levels.data(),
1877  rep_levels.data(),
1878  values_read,
1879  levels_read,
1880  values.data());
1881  }
1882 
1883  if (max_levels_read.has_value()) {
1884  total_levels_read += levels_read;
1885  if (total_levels_read >= max_levels_read.value()) {
1886  early_exit = true;
1887  break;
1888  }
1889  }
1890  }
1891  if (auto array_encoder = dynamic_cast<ParquetArrayEncoder*>(encoder.get())) {
1892  array_encoder->finalizeRowGroup();
1893  }
1894  } catch (const std::exception& error) {
1895  throw ForeignStorageException(
1896  std::string(error.what()) + " Row group: " + std::to_string(row_group_index) +
1897  ", Parquet column: '" + col_reader->descr()->path()->ToDotString() +
1898  "', Parquet file: '" + file_path + "'");
1899  }
1900  if (max_levels_read.has_value() && early_exit) {
1901  break;
1902  }
1903  }
1904  if (max_levels_read.has_value() && early_exit) {
1905  break;
1906  }
1907  }
1908 
1909  if (rejected_row_indices) { // error tracking is enabled
1910  *rejected_row_indices = encoder->getRejectedRowIndices();
1911  }
1912  return chunk_metadata;
1913 }


bool foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported ( const ColumnDescriptor *  omnisci_column,
const parquet::ColumnDescriptor *  parquet_column 
)
static

Determine if a Parquet to OmniSci column mapping is supported.

Parameters
omnisci_column - the column descriptor of the OmniSci column
parquet_column - the column descriptor of the Parquet column
Returns
true if the column mapping is supported by LazyParquetChunkLoader, false otherwise
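
For illustration, a caller holding both descriptors might gate a load on this check; the error handling shown is an assumption, not code from the source.

// Sketch: refuse to proceed when the Parquet type cannot be mapped to the
// OmniSci column type. `omnisci_column` and `parquet_column` are assumed to
// be valid descriptor pointers obtained elsewhere.
if (!foreign_storage::LazyParquetChunkLoader::isColumnMappingSupported(
        omnisci_column, parquet_column)) {
  throw std::runtime_error("Unsupported Parquet to OmniSci mapping for column: " +
                           omnisci_column->columnName);
}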

Definition at line 1927 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping().

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

1929  {
1930  if (validate_geospatial_mapping(omnisci_column, parquet_column)) {
1931  return true;
1932  }
1933  if (validate_array_mapping(omnisci_column, parquet_column)) {
1934  return true;
1935  }
1936  if (validate_decimal_mapping(omnisci_column, parquet_column)) {
1937  return true;
1938  }
1939  if (validate_floating_point_mapping(omnisci_column, parquet_column)) {
1940  return true;
1941  }
1942  if (validate_integral_mapping(omnisci_column, parquet_column)) {
1943  return true;
1944  }
1945  if (validate_none_type_mapping(omnisci_column, parquet_column)) {
1946  return true;
1947  }
1948  if (validate_timestamp_mapping(omnisci_column, parquet_column)) {
1949  return true;
1950  }
1951  if (validate_time_mapping(omnisci_column, parquet_column)) {
1952  return true;
1953  }
1954  if (validate_date_mapping(omnisci_column, parquet_column)) {
1955  return true;
1956  }
1957  if (validate_string_mapping(omnisci_column, parquet_column)) {
1958  return true;
1959  }
1960  return false;
1961 }


std::list< std::unique_ptr< ChunkMetadata > > foreign_storage::LazyParquetChunkLoader::loadChunk ( const std::vector< RowGroupInterval > &  row_group_intervals,
const int  parquet_column_index,
std::list< Chunk_NS::Chunk > &  chunks,
StringDictionary *  string_dictionary = nullptr,
RejectedRowIndices *  rejected_row_indices = nullptr 
)

Load a number of row groups of a column in a parquet file into a chunk

Parameters
row_group_intervals - a list of inclusive intervals [start,end] that specify the row groups to load
parquet_column_index - the logical column index in the parquet file (and omnisci db) of the column to load
chunks - a list containing the chunks to load
string_dictionary - a string dictionary for the column, if applicable
rejected_row_indices - optional; if specified, errors will be tracked in this data structure while loading
Returns
An empty list when no metadata update is applicable, otherwise a list of ChunkMetadata unique pointers with which to update the corresponding column chunk metadata. NOTE: only ChunkMetadata.sqlType and the min & max values of ChunkMetadata.chunkStats are valid; other values are not set.

NOTE: if more than one chunk is supplied, the first chunk is required to be the chunk corresponding to the logical column, while the remaining chunks correspond to physical columns (in ascending order of column id). Similarly, if a metadata update is expected, the list of ChunkMetadata unique pointers returned will correspond directly to the list of chunks. A call-site sketch follows.
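
A hedged call-site sketch; `loader`, `chunks`, `dict`, and `parquet_column_index` are assumptions standing in for state the caller (e.g. the data wrapper) prepares, and the file path is a placeholder.

// Sketch: load row groups [0,2] of one file into a prepared chunk list whose
// first element is the logical column's chunk.
std::vector<foreign_storage::RowGroupInterval> intervals{
    {"/data/part-0.parquet", 0, 2}};
RejectedRowIndices rejected_rows;  // populated because a tracker is passed in
auto chunk_metadata = loader.loadChunk(
    intervals, parquet_column_index, chunks, dict, &rejected_rows);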

Definition at line 1971 of file LazyParquetChunkLoader.cpp.

References appendRowGroups(), and CHECK.

Referenced by foreign_storage::ParquetDataWrapper::loadBuffersUsingLazyParquetChunkLoader().

1976  {
1977  CHECK(!chunks.empty());
1978  auto const& chunk = *chunks.begin();
1979  auto column_descriptor = chunk.getColumnDesc();
1980  auto buffer = chunk.getBuffer();
1981  CHECK(buffer);
1982 
1983  try {
1984  auto metadata = appendRowGroups(row_group_intervals,
1985  parquet_column_index,
1986  column_descriptor,
1987  chunks,
1988  string_dictionary,
1989  rejected_row_indices);
1990  return metadata;
1991  } catch (const std::exception& error) {
1992  throw ForeignStorageException(error.what());
1993  }
1994 
1995  return {};
1996 }


std::pair< size_t, size_t > foreign_storage::LazyParquetChunkLoader::loadRowGroups ( const RowGroupInterval &  row_group_interval,
const std::map< int, Chunk_NS::Chunk > &  chunks,
const ForeignTableSchema &  schema,
const std::map< int, StringDictionary * > &  column_dictionaries,
const int  num_threads = 1 
)

Load row groups of data into given chunks.

Parameters
row_group_interval - specifies which row groups to load
chunks - map of column index to chunk into which data will be loaded
schema - schema of the foreign table
column_dictionaries - a map of string dictionaries for columns that require them
num_threads - number of threads to utilize while reading (if applicable)
Returns
[num_rows_completed, num_rows_rejected] - the number of rows loaded and the number of rows rejected while loading

Note that only logical chunks are expected because the data is read in an intermediate form into the underlying buffers. This member is intended to be used for import.

NOTE: Currently, loading one row group at a time is required.
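
A hedged call-site sketch, with `loader`, `chunks_by_id`, `schema`, and `dictionaries` assumed to be prepared by the import path and the file path a placeholder:

// Sketch: import a single row group (start_index must equal end_index)
// using four reader threads.
foreign_storage::RowGroupInterval interval{"/data/part-0.parquet", 5, 5};
auto [num_rows_completed, num_rows_rejected] = loader.loadRowGroups(
    interval, chunks_by_id, schema, dictionaries, /*num_threads=*/4);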

Definition at line 2081 of file LazyParquetChunkLoader.cpp.

References threading_serial::async(), CHECK, DEBUG_TIMER, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, file_system_, shared::get_from_map(), foreign_storage::get_parquet_table_size(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getLogicalColumns(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), foreign_storage::open_parquet_table(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_import(), render_group_analyzer_map_, foreign_storage::RowGroupInterval::start_index, logger::thread_id(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns().

2086  {
2087  auto timer = DEBUG_TIMER(__func__);
2088 
2089  const auto& file_path = row_group_interval.file_path;
2090 
2091  // do not use caching with file-readers, open a new one for every request
2092  auto file_reader_owner = open_parquet_table(file_path, file_system_);
2093  auto file_reader = file_reader_owner.get();
2094  auto file_metadata = file_reader->parquet_reader()->metadata();
2095 
2096  validate_number_of_columns(file_metadata, file_path, schema);
2097 
2098  // check for fixed length encoded columns and indicate to the user
2099  // they should not be used
2100  for (const auto column_descriptor : schema.getLogicalColumns()) {
2101  auto parquet_column_index = schema.getParquetColumnIndex(column_descriptor->columnId);
2102  auto parquet_column = file_metadata->schema()->Column(parquet_column_index);
2103  try {
2104  validate_allowed_mapping(parquet_column, column_descriptor);
2105  } catch (std::runtime_error& e) {
2106  std::stringstream error_message;
2107  error_message << e.what() << " Parquet column: " << parquet_column->name()
2108  << ", HeavyDB column: " << column_descriptor->columnName
2109  << ", Parquet file: " << file_path << ".";
2110  throw std::runtime_error(error_message.str());
2111  }
2112  }
2113 
2114  CHECK(row_group_interval.start_index == row_group_interval.end_index);
2115  auto row_group_index = row_group_interval.start_index;
2116  std::map<int, ParquetRowGroupReader> row_group_reader_map;
2117 
2118  parquet::ParquetFileReader* parquet_reader = file_reader->parquet_reader();
2119  auto group_reader = parquet_reader->RowGroup(row_group_index);
2120 
2121  std::vector<InvalidRowGroupIndices> invalid_indices_per_thread(num_threads);
2122 
2123  auto encoder_map = populate_encoder_map_for_import(chunks,
2124  schema,
2125  file_reader,
2126  column_dictionaries,
2127  group_reader->metadata()->num_rows(),
2128  render_group_analyzer_map_
2129  );
2130  std::vector<std::set<int>> partitions(num_threads);
2131  std::map<int, int> column_id_to_thread;
2132  for (auto& [column_id, encoder] : encoder_map) {
2133  auto thread_id = column_id % num_threads;
2134  column_id_to_thread[column_id] = thread_id;
2135  partitions[thread_id].insert(column_id);
2136  }
2137 
2138  for (auto& [column_id, encoder] : encoder_map) {
2139  const auto& column_descriptor = schema.getColumnDescriptor(column_id);
2140  const auto parquet_column_index = schema.getParquetColumnIndex(column_id);
2141  auto parquet_column_descriptor =
2142  file_metadata->schema()->Column(parquet_column_index);
2143 
2144  // validate
2145  auto [num_row_groups, num_columns] = get_parquet_table_size(file_reader);
2146  CHECK(row_group_interval.start_index >= 0 &&
2147  row_group_interval.end_index < num_row_groups);
2148  CHECK(parquet_column_index >= 0 && parquet_column_index < num_columns);
2149  validate_max_repetition_and_definition_level(column_descriptor,
2150  parquet_column_descriptor);
2151 
2152  std::shared_ptr<parquet::ColumnReader> col_reader =
2153  group_reader->Column(parquet_column_index);
2154 
2155  row_group_reader_map.insert(
2156  {column_id,
2157  ParquetRowGroupReader(col_reader,
2158  column_descriptor,
2159  parquet_column_descriptor,
2160  shared::get_from_map(encoder_map, column_id).get(),
2161  invalid_indices_per_thread[shared::get_from_map(
2162  column_id_to_thread, column_id)],
2163  row_group_index,
2164  parquet_column_index,
2165  parquet_reader)});
2166  }
2167 
2168  std::vector<std::future<void>> futures;
2169  for (int ithread = 0; ithread < num_threads; ++ithread) {
2170  auto column_ids_for_thread = partitions[ithread];
2171  futures.emplace_back(
2172  std::async(std::launch::async, [&row_group_reader_map, column_ids_for_thread] {
2173  for (const auto column_id : column_ids_for_thread) {
2174  shared::get_from_map(row_group_reader_map, column_id)
2175  .readAndValidateRowGroup(); // reads and validate entire row group per
2176  // column
2177  }
2178  }));
2179  }
2180 
2181  for (auto& future : futures) {
2182  future.wait();
2183  }
2184 
2185  for (auto& future : futures) {
2186  future.get();
2187  }
2188 
2189  // merge/reduce invalid indices
2190  InvalidRowGroupIndices invalid_indices;
2191  for (auto& thread_invalid_indices : invalid_indices_per_thread) {
2192  invalid_indices.merge(thread_invalid_indices);
2193  }
2194 
2195  for (auto& [_, reader] : row_group_reader_map) {
2196  reader.eraseInvalidRowGroupData(
2197  invalid_indices); // removes invalid encoded data in buffers
2198  }
2199 
2200  // update the element count for each encoder
2201  for (const auto column_descriptor : schema.getLogicalColumns()) {
2202  auto column_id = column_descriptor->columnId;
2203  auto db_encoder = shared::get_from_map(chunks, column_id).getBuffer()->getEncoder();
2204  CHECK(static_cast<size_t>(group_reader->metadata()->num_rows()) >=
2205  invalid_indices.size());
2206  size_t updated_num_elems = db_encoder->getNumElems() +
2207  group_reader->metadata()->num_rows() -
2208  invalid_indices.size();
2209  db_encoder->setNumElems(updated_num_elems);
2210  if (column_descriptor->columnType.is_geometry()) {
2211  for (int i = 0; i < column_descriptor->columnType.get_physical_cols(); ++i) {
2212  auto db_encoder =
2213  shared::get_from_map(chunks, column_id + i + 1).getBuffer()->getEncoder();
2214  db_encoder->setNumElems(updated_num_elems);
2215  }
2216  }
2217  }
2218 
2219  return {group_reader->metadata()->num_rows() - invalid_indices.size(),
2220  invalid_indices.size()};
2221 }


std::list< RowGroupMetadata > foreign_storage::LazyParquetChunkLoader::metadataScan ( const std::vector< std::string > &  file_paths,
const ForeignTableSchema &  schema,
const bool  do_metadata_stats_validation = true 
)

Perform a metadata scan for the paths specified.

Parameters
file_paths - (ordered) files of the metadata scan
schema - schema of the foreign table to perform metadata scan for
do_metadata_stats_validation - validate stats in metadata of parquet files if true
Returns
a list of the row group metadata extracted from file_paths
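
A hedged usage sketch (`loader` and `schema` assumed, file paths placeholders); note the files must share an identical Parquet schema, which the scan validates:

// Sketch: scan an ordered set of files; each RowGroupMetadata entry carries
// per-row-group metadata the data wrapper uses to build fragments.
auto row_group_metadata = loader.metadataScan(
    {"/data/part-0.parquet", "/data/part-1.parquet"}, schema);
for (const auto& row_group : row_group_metadata) {
  // inspect or persist the scanned metadata here
}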

Definition at line 2386 of file LazyParquetChunkLoader.cpp.

References threading_serial::async(), CHECK, DEBUG_TIMER, file_reader_cache_, file_system_, g_max_import_threads, foreign_storage::get_parquet_table_size(), foreign_storage::ForeignTableSchema::getForeignTable(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::FileReaderMap::initializeIfEmpty(), foreign_storage::FileReaderMap::insert(), TableDescriptor::maxFragRows, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval(), foreign_storage::partition_for_threads(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_metadata_scan(), render_group_analyzer_map_, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_row_group_larger_than_fragment_size_error(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata().

Referenced by foreign_storage::ParquetDataWrapper::metadataScanFiles().

2389  {
2390  auto timer = DEBUG_TIMER(__func__);
2391  auto column_interval =
2392  Interval<ColumnType>{schema.getLogicalAndPhysicalColumns().front()->columnId,
2393  schema.getLogicalAndPhysicalColumns().back()->columnId};
2394  CHECK(!file_paths.empty());
2395 
2396  // The encoder map needs to be populated before we can start scanning rowgroups, so we
2397  // peel the first file_path out of the async loop below to perform population.
2398  const auto& first_path = *file_paths.begin();
2399  auto first_reader = file_reader_cache_->insert(first_path, file_system_);
2400  auto max_row_group_stats = validate_parquet_metadata(
2401  first_reader->parquet_reader()->metadata(), first_path, schema);
2402  auto encoder_map = populate_encoder_map_for_metadata_scan(column_interval,
2403  schema,
2404  first_reader,
2405  render_group_analyzer_map_,
2406  do_metadata_stats_validation);
2407  const auto num_row_groups = get_parquet_table_size(first_reader).first;
2408  auto row_group_metadata = metadata_scan_rowgroup_interval(
2409  encoder_map, {first_path, 0, num_row_groups - 1}, first_reader, schema);
2410 
2411  // We want each (filepath->FileReader) pair in the cache to be initialized before we
2412  // multithread so that we are not adding keys in a concurrent environment, so we add
2413  // cache entries for each path and initialize to an empty unique_ptr if the file has not
2414  // yet been opened.
2415  // Since we have already performed the first iteration, we skip it in the thread groups
2416  // so as not to process it twice.
2417  std::vector<std::string> cache_subset;
2418  for (auto path_it = ++(file_paths.begin()); path_it != file_paths.end(); ++path_it) {
2419  file_reader_cache_->initializeIfEmpty(*path_it);
2420  cache_subset.emplace_back(*path_it);
2421  }
2422 
2423  // Iterate asyncronously over any paths beyond the first.
2424  auto paths_per_thread = partition_for_threads(cache_subset, g_max_import_threads);
2425  std::vector<std::future<std::pair<std::list<RowGroupMetadata>, MaxRowGroupSizeStats>>>
2426  futures;
2427  for (const auto& path_group : paths_per_thread) {
2428  futures.emplace_back(std::async(
2429  std::launch::async,
2430  [&](const auto& paths, const auto& file_reader_cache)
2431  -> std::pair<std::list<RowGroupMetadata>, MaxRowGroupSizeStats> {
2432  std::list<RowGroupMetadata> reduced_metadata;
2433  MaxRowGroupSizeStats max_row_group_stats{0, 0};
2434  for (const auto& path : paths.get()) {
2435  auto reader = file_reader_cache.get().getOrInsert(path, file_system_);
2436  validate_equal_schema(first_reader, reader, first_path, path);
2437  auto local_max_row_group_stats = validate_parquet_metadata(
2438  reader->parquet_reader()->metadata(), path, schema);
2439  if (local_max_row_group_stats.max_row_group_size >
2440  max_row_group_stats.max_row_group_size) {
2441  max_row_group_stats = local_max_row_group_stats;
2442  }
2443  const auto num_row_groups = get_parquet_table_size(reader).first;
2444  const auto interval = RowGroupInterval{path, 0, num_row_groups - 1};
2445  reduced_metadata.splice(
2446  reduced_metadata.end(),
2447  metadata_scan_rowgroup_interval(encoder_map, interval, reader, schema));
2448  }
2449  return {reduced_metadata, max_row_group_stats};
2450  },
2451  std::ref(path_group),
2452  std::ref(*file_reader_cache_)));
2453  }
2454 
2455  // Reduce all the row_group results.
2456  for (auto& future : futures) {
2457  auto [metadata, local_max_row_group_stats] = future.get();
2458  row_group_metadata.splice(row_group_metadata.end(), metadata);
2459  if (local_max_row_group_stats.max_row_group_size >
2460  max_row_group_stats.max_row_group_size) {
2461  max_row_group_stats = local_max_row_group_stats;
2462  }
2463  }
2464 
2465  if (max_row_group_stats.max_row_group_size > schema.getForeignTable()->maxFragRows) {
2466  throw_row_group_larger_than_fragment_size_error(
2467  max_row_group_stats, schema.getForeignTable()->maxFragRows);
2468  }
2469 
2470  return row_group_metadata;
2471 }


DataPreview foreign_storage::LazyParquetChunkLoader::previewFiles ( const std::vector< std::string > &  files,
const size_t  max_num_rows 
)

Preview rows of data and column types in a set of files.

Parameters
files - files to preview
max_num_rows - maximum number of rows to preview
Returns
a DataPreview instance that contains relevant preview information
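
A hedged usage sketch (`loader` assumed, path a placeholder; the get_type_name() accessor on SQLTypeInfo is also an assumption here):

// Sketch: preview up to 100 rows and report each detected column type.
#include <iostream>

auto preview = loader.previewFiles({"/data/part-0.parquet"}, /*max_num_rows=*/100);
for (size_t i = 0; i < preview.column_names.size(); ++i) {
  std::cout << preview.column_names[i] << ": "
            << preview.column_types[i].get_type_name() << "\n";
}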

Definition at line 2230 of file LazyParquetChunkLoader.cpp.

References appendRowGroups(), CHECK, CHECK_EQ, CHECK_GE, foreign_storage::PreviewContext::column_chunks, foreign_storage::PreviewContext::column_descriptors, foreign_storage::DataPreview::column_names, foreign_storage::DataPreview::column_types, ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, foreign_storage::create_futures_for_workers(), foreign_storage::PreviewContext::detect_buffers, foreign_storage::detect_geo_type(), file_reader_cache_, file_system_, g_max_import_threads, foreign_storage::FileReaderMap::getOrInsert(), gpu_enabled::iota(), ColumnDescriptor::isSystemCol, ColumnDescriptor::isVirtualCol, kENCODING_NONE, foreign_storage::DataPreview::num_rejected_rows, foreign_storage::PreviewContext::rejected_row_indices_per_column, foreign_storage::DataPreview::sample_rows, suggestColumnMapping(), ColumnDescriptor::tableId, and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

2231  {
2232  CHECK(!files.empty());
2233 
2234  auto first_file = *files.begin();
2235  auto first_file_reader = file_reader_cache_->getOrInsert(*files.begin(), file_system_);
2236 
2237  for (auto current_file_it = ++files.begin(); current_file_it != files.end();
2238  ++current_file_it) {
2239  auto file_reader = file_reader_cache_->getOrInsert(*current_file_it, file_system_);
2240  validate_equal_schema(first_file_reader, file_reader, first_file, *current_file_it);
2241  }
2242 
2243  auto first_file_metadata = first_file_reader->parquet_reader()->metadata();
2244  auto num_columns = first_file_metadata->num_columns();
2245 
2246  DataPreview data_preview;
2247  data_preview.num_rejected_rows = 0;
2248 
2249  auto current_file_it = files.begin();
2250  while (data_preview.sample_rows.size() < max_num_rows &&
2251  current_file_it != files.end()) {
2252  size_t total_num_rows = data_preview.sample_rows.size();
2253  size_t max_num_rows_to_append = max_num_rows - data_preview.sample_rows.size();
2254 
2255  // gather enough rows in row groups to produce required samples
2256  std::vector<RowGroupInterval> row_group_intervals;
2257  for (; current_file_it != files.end(); ++current_file_it) {
2258  const auto& file_path = *current_file_it;
2259  auto file_reader = file_reader_cache_->getOrInsert(file_path, file_system_);
2260  auto file_metadata = file_reader->parquet_reader()->metadata();
2261  auto num_row_groups = file_metadata->num_row_groups();
2262  int end_row_group = 0;
2263  for (int i = 0; i < num_row_groups && total_num_rows < max_num_rows; ++i) {
2264  const size_t next_num_rows = file_metadata->RowGroup(i)->num_rows();
2265  total_num_rows += next_num_rows;
2266  end_row_group = i;
2267  }
2268  row_group_intervals.push_back(RowGroupInterval{file_path, 0, end_row_group});
2269  }
2270 
2271  PreviewContext preview_context;
2272  for (int i = 0; i < num_columns; ++i) {
2273  auto col = first_file_metadata->schema()->Column(i);
2274  ColumnDescriptor& cd = preview_context.column_descriptors.emplace_back();
2275  auto sql_type = LazyParquetChunkLoader::suggestColumnMapping(col);
2276  cd.columnType = sql_type;
2277  cd.columnName =
2278  sql_type.is_array() ? col->path()->ToDotVector()[0] + "_array" : col->name();
2279  cd.isSystemCol = false;
2280  cd.isVirtualCol = false;
2281  cd.tableId = -1;
2282  cd.columnId = i + 1;
2283  data_preview.column_names.emplace_back(cd.columnName);
2284  data_preview.column_types.emplace_back(sql_type);
2285  preview_context.detect_buffers.push_back(
2286  std::make_unique<TypedParquetDetectBuffer>());
2287  preview_context.rejected_row_indices_per_column.push_back(
2288  std::make_unique<RejectedRowIndices>());
2289  auto& detect_buffer = preview_context.detect_buffers.back();
2290  auto& chunk = preview_context.column_chunks.emplace_back(&cd);
2291  chunk.setPinnable(false);
2292  chunk.setBuffer(detect_buffer.get());
2293  }
2294 
2295  std::function<void(const std::vector<int>&)> append_row_groups_for_column =
2296  [&](const std::vector<int>& column_indices) {
2297  for (const auto& column_index : column_indices) {
2298  auto& chunk = preview_context.column_chunks[column_index];
2299  auto chunk_list = std::list<Chunk_NS::Chunk>{chunk};
2300  auto& rejected_row_indices =
2301  preview_context.rejected_row_indices_per_column[column_index];
2302  appendRowGroups(row_group_intervals,
2303  column_index,
2304  chunk.getColumnDesc(),
2305  chunk_list,
2306  nullptr,
2307  rejected_row_indices.get(),
2308  true,
2309  max_num_rows_to_append);
2310  }
2311  };
2312 
2313  std::vector<int> columns(num_columns);
2314  std::iota(columns.begin(), columns.end(), 0);
2315  auto futures = create_futures_for_workers(
2316  columns, g_max_import_threads, append_row_groups_for_column);
2317  for (auto& future : futures) {
2318  future.wait();
2319  }
2320  for (auto& future : futures) {
2321  future.get();
2322  }
2323 
2324  // merge all `rejected_row_indices_per_column`
2325  auto rejected_row_indices = std::make_unique<RejectedRowIndices>();
2326  for (int i = 0; i < num_columns; ++i) {
2327  rejected_row_indices->insert(
2328  preview_context.rejected_row_indices_per_column[i]->begin(),
2329  preview_context.rejected_row_indices_per_column[i]->end());
2330  }
2331 
2332  size_t num_rows = 0;
2333  auto buffers_it = preview_context.detect_buffers.begin();
2334  for (int i = 0; i < num_columns; ++i, ++buffers_it) {
2335  CHECK(buffers_it != preview_context.detect_buffers.end());
2336  auto& strings = buffers_it->get()->getStrings();
2337  if (i == 0) {
2338  num_rows = strings.size();
2339  } else {
2340  CHECK_EQ(num_rows, strings.size());
2341  }
2342  }
2343 
2344  size_t num_rejected_rows = rejected_row_indices->size();
2345  data_preview.num_rejected_rows += num_rejected_rows;
2346  CHECK_GE(num_rows, num_rejected_rows);
2347  auto row_count = num_rows - num_rejected_rows;
2348 
2349  auto offset_row = data_preview.sample_rows.size();
2350  data_preview.sample_rows.resize(std::min(offset_row + row_count, max_num_rows));
2351 
2352  for (size_t irow = 0, rows_appended = 0;
2353  irow < num_rows && offset_row + rows_appended < max_num_rows;
2354  ++irow) {
2355  if (rejected_row_indices->find(irow) != rejected_row_indices->end()) {
2356  continue;
2357  }
2358  auto& row_data = data_preview.sample_rows[offset_row + rows_appended];
2359  row_data.resize(num_columns);
2360  auto buffers_it = preview_context.detect_buffers.begin();
2361  for (int i = 0; i < num_columns; ++i, ++buffers_it) {
2362  CHECK(buffers_it != preview_context.detect_buffers.end());
2363  auto& strings = buffers_it->get()->getStrings();
2364  row_data[i] = strings[irow];
2365  }
2366  ++rows_appended;
2367  }
2368  }
2369 
2370  // attempt to detect geo columns
2371  for (int i = 0; i < num_columns; ++i) {
2372  auto type_info = data_preview.column_types[i];
2373  if (type_info.is_string()) {
2374  auto tentative_geo_type =
2375  foreign_storage::detect_geo_type(data_preview.sample_rows, i);
2376  if (tentative_geo_type.has_value()) {
2377  data_preview.column_types[i].set_type(tentative_geo_type.value());
2378  data_preview.column_types[i].set_compression(kENCODING_NONE);
2379  }
2380  }
2381  }
2382 
2383  return data_preview;
2384 }


SQLTypeInfo foreign_storage::LazyParquetChunkLoader::suggestColumnMapping ( const parquet::ColumnDescriptor *  parquet_column)
staticprivate

Suggest a possible Parquet to OmniSci column mapping based on heuristics.

Parameters
parquet_column - the column descriptor of the Parquet column
Returns
a supported OmniSci SQLTypeInfo given the Parquet column type

NOTE: the suggested type may be entirely inappropriate for a specific use-case; however, it is guaranteed to be an allowed mapping. For example, no attempt is made to detect geo-types, and strings are always suggested in their place.
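
A brief sketch mirroring the previewFiles() call site (the geo example restates the NOTE above; `parquet_column` is an assumed descriptor pointer):

// Sketch: the suggestion is always a loadable mapping, but may not be the
// type a user ultimately wants; e.g. WKT geo data is suggested as a string.
auto sql_type = LazyParquetChunkLoader::suggestColumnMapping(parquet_column);
if (sql_type.is_array()) {
  // parquet_column was detected as a valid Parquet list column
}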

Definition at line 1915 of file LazyParquetChunkLoader.cpp.

References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_list_column(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_column_scalar_type(), and run_benchmark_import::type.

Referenced by previewFiles().

1916  {
1917  auto type = suggest_column_scalar_type(parquet_column);
1918 
1919  // array case
1920  if (is_valid_parquet_list_column(parquet_column)) {
1921  return type.get_array_type();
1922  }
1923 
1924  return type;
1925 }


Member Data Documentation

const int foreign_storage::LazyParquetChunkLoader::batch_reader_num_elements = 4096
static
FileReaderMap* foreign_storage::LazyParquetChunkLoader::file_reader_cache_
private

Definition at line 171 of file LazyParquetChunkLoader.h.

Referenced by appendRowGroups(), metadataScan(), and previewFiles().

std::shared_ptr<arrow::fs::FileSystem> foreign_storage::LazyParquetChunkLoader::file_system_
private
const RenderGroupAnalyzerMap* foreign_storage::LazyParquetChunkLoader::render_group_analyzer_map_
private

Definition at line 173 of file LazyParquetChunkLoader.h.

Referenced by appendRowGroups(), loadRowGroups(), and metadataScan().


The documentation for this class was generated from the following files:
LazyParquetChunkLoader.h
LazyParquetChunkLoader.cpp