OmniSciDB
a667adc9c8
|
#include <ParquetDataWrapper.h>
Private Member Functions | |
std::list< const ColumnDescriptor * > | getColumnsToInitialize (const Interval< ColumnType > &column_interval) |
void | initializeChunkBuffers (const int fragment_index, const Interval< ColumnType > &column_interval, const ChunkToBufferMap &required_buffers, const bool reserve_buffers_and_set_stats=false) |
void | fetchChunkMetadata () |
void | loadBuffersUsingLazyParquetChunkLoader (const int logical_column_id, const int fragment_id, const ChunkToBufferMap &required_buffers) |
std::set< std::string > | getProcessedFilePaths () |
std::set< std::string > | getAllFilePaths () |
bool | moveToNextFragment (size_t new_rows_count) const |
void | finalizeFragmentMap () |
void | addNewFragment (int row_group, const std::string &file_path) |
bool | isNewFile (const std::string &file_path) const |
void | addNewFile (const std::string &file_path) |
void | resetParquetMetadata () |
void | metadataScanFiles (const std::set< std::string > &file_paths) |
Private Attributes | |
std::map< int, std::vector < RowGroupInterval > > | fragment_to_row_group_interval_map_ |
std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > | chunk_metadata_map_ |
const int | db_id_ |
const ForeignTable * | foreign_table_ |
int | last_fragment_index_ |
size_t | last_fragment_row_count_ |
size_t | total_row_count_ |
int | last_row_group_ |
bool | is_restored_ |
std::unique_ptr < ForeignTableSchema > | schema_ |
std::shared_ptr < arrow::fs::FileSystem > | file_system_ |
std::unique_ptr< FileReaderMap > | file_reader_cache_ |
Additional Inherited Members | |
Types inherited from foreign_storage::ForeignDataWrapper | |
enum | ParallelismLevel { NONE, INTRA_FRAGMENT, INTER_FRAGMENT } |
Static Attributes inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static const std::string | STORAGE_TYPE_KEY = "STORAGE_TYPE" |
static const std::string | BASE_PATH_KEY = "BASE_PATH" |
static const std::string | FILE_PATH_KEY = "FILE_PATH" |
static const std::string | LOCAL_FILE_STORAGE_TYPE = "LOCAL_FILE" |
static const std::string | S3_STORAGE_TYPE = "AWS_S3" |
static const std::array < std::string, 1 > | supported_storage_types |
Static Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static std::string | getFullFilePath (const ForeignTable *foreign_table) |
Returns the path to the source file/dir of the table. Depending on the options, this may be the result of concatenating the server and table path options. | |
Definition at line 35 of file ParquetDataWrapper.h.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | ) |
Definition at line 73 of file ParquetDataWrapper.cpp.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | const int | db_id, |
const ForeignTable * | foreign_table | ||
) |
Definition at line 75 of file ParquetDataWrapper.cpp.
References file_system_, foreign_storage::ForeignTable::foreign_server, foreign_storage::AbstractFileStorageDataWrapper::LOCAL_FILE_STORAGE_TYPE, foreign_storage::OptionsContainer::options, foreign_storage::AbstractFileStorageDataWrapper::STORAGE_TYPE_KEY, and UNREACHABLE.
|
private |
Definition at line 204 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, fragment_to_row_group_interval_map_, last_fragment_index_, and last_row_group_.
Referenced by metadataScanFiles().
|
private |
Definition at line 178 of file ParquetDataWrapper.cpp.
References CHECK, fragment_to_row_group_interval_map_, last_fragment_index_, last_fragment_row_count_, and last_row_group_.
Referenced by metadataScanFiles().
|
private |
Definition at line 218 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, chunk_metadata_map_, db_id_, file_reader_cache_, file_system_, foreign_table_, getAllFilePaths(), Catalog_Namespace::SysCatalog::getCatalog(), getProcessedFilePaths(), Catalog_Namespace::SysCatalog::instance(), foreign_storage::ForeignTable::isAppendMode(), metadataScanFiles(), resetParquetMetadata(), foreign_storage::throw_removed_file_error(), foreign_storage::throw_removed_row_error(), and total_row_count_.
Referenced by populateChunkMetadata().
|
private |
Definition at line 173 of file ParquetDataWrapper.cpp.
References fragment_to_row_group_interval_map_, last_fragment_index_, and last_row_group_.
Referenced by metadataScanFiles().
|
private |
Definition at line 280 of file ParquetDataWrapper.cpp.
References DEBUG_TIMER, file_system_, foreign_table_, and foreign_storage::AbstractFileStorageDataWrapper::getFullFilePath().
Referenced by fetchChunkMetadata().
|
inlineoverridevirtual |
Gets the desired level of parallelism for the data wrapper when a cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 54 of file ParquetDataWrapper.h.
References foreign_storage::ForeignDataWrapper::INTER_FRAGMENT.
|
private |
Definition at line 104 of file ParquetDataWrapper.cpp.
References CHECK, db_id_, foreign_storage::Interval< T >::end, Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), schema_, and foreign_storage::Interval< T >::start.
Referenced by initializeChunkBuffers().
|
inlineoverridevirtual |
Gets the desired level of parallelism for the data wrapper when no cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 56 of file ParquetDataWrapper.h.
References foreign_storage::ForeignDataWrapper::INTRA_FRAGMENT.
|
private |
Definition at line 270 of file ParquetDataWrapper.cpp.
References fragment_to_row_group_interval_map_.
Referenced by fetchChunkMetadata().
|
private |
Definition at line 121 of file ParquetDataWrapper.cpp.
References CHECK, chunk_metadata_map_, db_id_, foreign_table_, getColumnsToInitialize(), kENCODING_NONE, and TableDescriptor::tableId.
Referenced by loadBuffersUsingLazyParquetChunkLoader().
|
private |
Definition at line 190 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, fragment_to_row_group_interval_map_, and last_fragment_index_.
Referenced by metadataScanFiles().
|
overridevirtual |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 553 of file ParquetDataWrapper.cpp.
References is_restored_.
|
private |
Definition at line 367 of file ParquetDataWrapper.cpp.
References CHECK, chunk_metadata_map_, ColumnDescriptor::columnType, db_id_, foreign_storage::Interval< T >::end, file_reader_cache_, file_system_, foreign_table_, fragment_to_row_group_interval_map_, TableDescriptor::fragmenter, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_elem_type(), SQLTypeInfo::get_physical_cols(), Catalog_Namespace::SysCatalog::getCatalog(), initializeChunkBuffers(), Catalog_Namespace::SysCatalog::instance(), SQLTypeInfo::is_array(), SQLTypeInfo::is_dict_encoded_string(), SQLTypeInfo::is_geometry(), foreign_storage::LazyParquetChunkLoader::loadChunk(), schema_, foreign_storage::Interval< T >::start, and TableDescriptor::tableId.
Referenced by populateChunkBuffers().
|
private |
Definition at line 306 of file ParquetDataWrapper.cpp.
References addNewFile(), addNewFragment(), CHECK, CHECK_EQ, chunk_metadata_map_, db_id_, file_reader_cache_, file_system_, finalizeFragmentMap(), foreign_table_, isNewFile(), last_fragment_index_, last_fragment_row_count_, last_row_group_, foreign_storage::LazyParquetChunkLoader::metadataScan(), moveToNextFragment(), foreign_storage::anonymous_namespace{ParquetDataWrapper.cpp}::reduce_metadata(), schema_, TableDescriptor::tableId, and total_row_count_.
Referenced by fetchChunkMetadata().
|
private |
Definition at line 354 of file ParquetDataWrapper.cpp.
References foreign_table_, last_fragment_row_count_, and TableDescriptor::maxFragRows.
Referenced by metadataScanFiles().
|
overridevirtual |
Populates the given chunk buffers identified by chunk keys. All provided chunk buffers are expected to be for the same fragment.
required_buffers | - chunk buffers that must always be populated |
optional_buffers | - chunk buffers that can be optionally populated, if the data wrapper has to scan through chunk data anyway (typically for row-wise data formats) |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 464 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_FRAGMENT_IDX, g_max_import_threads, loadBuffersUsingLazyParquetChunkLoader(), foreign_storage::partition_for_threads(), and schema_.
|
overridevirtual |
Populates the given chunk metadata vector with metadata for all chunks in the related foreign table.
chunk_metadata_vector | - vector that will be populated with chunk metadata |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 359 of file ParquetDataWrapper.cpp.
References chunk_metadata_map_, and fetchChunkMetadata().
|
private |
Definition at line 93 of file ParquetDataWrapper.cpp.
References file_reader_cache_, fragment_to_row_group_interval_map_, last_fragment_index_, last_fragment_row_count_, last_row_group_, and total_row_count_.
Referenced by fetchChunkMetadata().
|
overridevirtual |
Restores the internal state of the data wrapper.
file_path | - location of the file created by serializeDataWrapperInternals() |
chunk_metadata_vector | - vector of chunk metadata recovered from disk |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 532 of file ParquetDataWrapper.cpp.
References CHECK, chunk_metadata_map_, test_fsi::d, fragment_to_row_group_interval_map_, foreign_storage::json_utils::get_value_from_object(), is_restored_, last_fragment_index_, last_fragment_row_count_, last_row_group_, foreign_storage::json_utils::read_from_file(), and total_row_count_.
|
overridevirtual |
Serializes the internal state of the data wrapper into a file at the given path, if implemented.
file_path | - location to save file to |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 512 of file ParquetDataWrapper.cpp.
References foreign_storage::json_utils::add_value_to_object(), test_fsi::d, fragment_to_row_group_interval_map_, last_fragment_index_, last_fragment_row_count_, last_row_group_, total_row_count_, and foreign_storage::json_utils::write_to_file().
|
private |
Definition at line 89 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), populateChunkMetadata(), and restoreDataWrapperInternals().
|
private |
Definition at line 90 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getColumnsToInitialize(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), and metadataScanFiles().
|
private |
Definition at line 99 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and resetParquetMetadata().
|
private |
Definition at line 98 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getAllFilePaths(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and ParquetDataWrapper().
|
private |
Definition at line 91 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getAllFilePaths(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and moveToNextFragment().
|
private |
Definition at line 88 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), getProcessedFilePaths(), isNewFile(), loadBuffersUsingLazyParquetChunkLoader(), resetParquetMetadata(), restoreDataWrapperInternals(), and serializeDataWrapperInternals().
|
private |
Definition at line 96 of file ParquetDataWrapper.h.
Referenced by isRestored(), and restoreDataWrapperInternals().
|
private |
Definition at line 92 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), isNewFile(), metadataScanFiles(), resetParquetMetadata(), restoreDataWrapperInternals(), and serializeDataWrapperInternals().
|
private |
Definition at line 93 of file ParquetDataWrapper.h.
Referenced by addNewFragment(), metadataScanFiles(), moveToNextFragment(), resetParquetMetadata(), restoreDataWrapperInternals(), and serializeDataWrapperInternals().
|
private |
Definition at line 95 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), metadataScanFiles(), resetParquetMetadata(), restoreDataWrapperInternals(), and serializeDataWrapperInternals().
|
private |
Definition at line 97 of file ParquetDataWrapper.h.
Referenced by getColumnsToInitialize(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and populateChunkBuffers().
|
private |
Definition at line 94 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), metadataScanFiles(), resetParquetMetadata(), restoreDataWrapperInternals(), and serializeDataWrapperInternals().