OmniSciDB
085a039ca4
|
#include <ParquetDataWrapper.h>
Public Member Functions | |
ParquetDataWrapper () | |
ParquetDataWrapper (const int db_id, const ForeignTable *foreign_table, const bool do_metadata_stats_validation=true) | |
ParquetDataWrapper (const ForeignTable *foreign_table, std::shared_ptr< arrow::fs::FileSystem > file_system) | |
Constructor intended for detect use-case only. More... | |
void | populateChunkMetadata (ChunkMetadataVector &chunk_metadata_vector) override |
void | populateChunkBuffers (const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers, AbstractBuffer *delete_buffer) override |
std::string | getSerializedDataWrapper () const override |
void | restoreDataWrapperInternals (const std::string &file_path, const ChunkMetadataVector &chunk_metadata_vector) override |
bool | isRestored () const override |
ParallelismLevel | getCachedParallelismLevel () const override |
ParallelismLevel | getNonCachedParallelismLevel () const override |
void | createRenderGroupAnalyzers () override |
Create RenderGroupAnalyzers for poly columns. More... | |
DataPreview | getDataPreview (const size_t num_rows) |
Public Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
AbstractFileStorageDataWrapper () | |
void | validateServerOptions (const ForeignServer *foreign_server) const override |
void | validateTableOptions (const ForeignTable *foreign_table) const override |
const std::set < std::string_view > & | getSupportedTableOptions () const override |
void | validateUserMappingOptions (const UserMapping *user_mapping, const ForeignServer *foreign_server) const override |
const std::set < std::string_view > & | getSupportedUserMappingOptions () const override |
Public Member Functions inherited from foreign_storage::ForeignDataWrapper | |
ForeignDataWrapper ()=default | |
virtual | ~ForeignDataWrapper ()=default |
virtual const std::set < std::string > | getAlterableTableOptions () const |
virtual void | validateSchema (const std::list< ColumnDescriptor > &columns) const |
Private Member Functions | |
std::list< const ColumnDescriptor * > | getColumnsToInitialize (const Interval< ColumnType > &column_interval) |
void | initializeChunkBuffers (const int fragment_index, const Interval< ColumnType > &column_interval, const ChunkToBufferMap &required_buffers, const bool reserve_buffers_and_set_stats=false) |
void | fetchChunkMetadata () |
void | loadBuffersUsingLazyParquetChunkLoader (const int logical_column_id, const int fragment_id, const ChunkToBufferMap &required_buffers, AbstractBuffer *delete_buffer) |
std::set< std::string > | getProcessedFilePaths () |
std::vector< std::string > | getAllFilePaths () |
bool | moveToNextFragment (size_t new_rows_count) const |
void | finalizeFragmentMap () |
void | addNewFragment (int row_group, const std::string &file_path) |
bool | isNewFile (const std::string &file_path) const |
void | addNewFile (const std::string &file_path) |
void | resetParquetMetadata () |
void | metadataScanFiles (const std::vector< std::string > &file_paths) |
Private Attributes | |
const bool | do_metadata_stats_validation_ |
std::map< int, std::vector < RowGroupInterval > > | fragment_to_row_group_interval_map_ |
std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > | chunk_metadata_map_ |
const int | db_id_ |
const ForeignTable * | foreign_table_ |
int | last_fragment_index_ |
size_t | last_fragment_row_count_ |
size_t | total_row_count_ |
int | last_row_group_ |
bool | is_restored_ |
std::unique_ptr < ForeignTableSchema > | schema_ |
std::shared_ptr < arrow::fs::FileSystem > | file_system_ |
std::unique_ptr< FileReaderMap > | file_reader_cache_ |
std::mutex | delete_buffer_mutex_ |
RenderGroupAnalyzerMap | render_group_analyzer_map_ |
Additional Inherited Members | |
Types inherited from foreign_storage::ForeignDataWrapper | |
enum | ParallelismLevel { NONE, INTRA_FRAGMENT, INTER_FRAGMENT } |
Static Attributes inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static const std::string | STORAGE_TYPE_KEY = "STORAGE_TYPE" |
static const std::string | BASE_PATH_KEY = "BASE_PATH" |
static const std::string | FILE_PATH_KEY = "FILE_PATH" |
static const std::string | REGEX_PATH_FILTER_KEY = "REGEX_PATH_FILTER" |
static const std::string | LOCAL_FILE_STORAGE_TYPE = "LOCAL_FILE" |
static const std::string | S3_STORAGE_TYPE = "AWS_S3" |
static const std::string | FILE_SORT_ORDER_BY_KEY = shared::FILE_SORT_ORDER_BY_KEY |
static const std::string | FILE_SORT_REGEX_KEY = shared::FILE_SORT_REGEX_KEY |
static const std::array < std::string, 1 > | supported_storage_types |
Static Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static std::string | getFullFilePath (const ForeignTable *foreign_table) |
Returns the path to the source file/dir of the table. Depending on options this may result from a concatenation of server and table path options. More... | |
Definition at line 40 of file ParquetDataWrapper.h.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | ) |
Definition at line 76 of file ParquetDataWrapper.cpp.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | const int | db_id, |
const ForeignTable * | foreign_table, | ||
const bool | do_metadata_stats_validation = true |
||
) |
Definition at line 92 of file ParquetDataWrapper.cpp.
References file_system_, foreign_storage::ForeignTable::foreign_server, foreign_storage::AbstractFileStorageDataWrapper::LOCAL_FILE_STORAGE_TYPE, foreign_storage::OptionsContainer::options, foreign_storage::AbstractFileStorageDataWrapper::STORAGE_TYPE_KEY, and UNREACHABLE.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | const ForeignTable * | foreign_table, |
std::shared_ptr< arrow::fs::FileSystem > | file_system | ||
) |
Constructor intended for detect use-case only.
Definition at line 79 of file ParquetDataWrapper.cpp.
void foreign_storage::ParquetDataWrapper::addNewFile | ( | const std::string & | file_path | ) |
private |
Definition at line 226 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, fragment_to_row_group_interval_map_, last_fragment_index_, and last_row_group_.
Referenced by metadataScanFiles().
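void foreign_storage::ParquetDataWrapper::addNewFragment | ( | int | row_group, |
const std::string & | file_path | ||
) |
private |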
Definition at line 200 of file ParquetDataWrapper.cpp.
References CHECK, fragment_to_row_group_interval_map_, last_fragment_index_, last_fragment_row_count_, and last_row_group_.
Referenced by metadataScanFiles().
void foreign_storage::ParquetDataWrapper::createRenderGroupAnalyzers | ( | ) |
overridevirtual |
Create RenderGroupAnalyzers for poly columns.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 623 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_GE, db_id_, foreign_table_, Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), IS_GEO_POLY, render_group_analyzer_map_, and TableDescriptor::tableId.
void foreign_storage::ParquetDataWrapper::fetchChunkMetadata | ( | ) |
private |
Definition at line 240 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, chunk_metadata_map_, shared::contains(), db_id_, file_reader_cache_, file_system_, foreign_table_, getAllFilePaths(), Catalog_Namespace::SysCatalog::getCatalog(), getProcessedFilePaths(), Catalog_Namespace::SysCatalog::instance(), foreign_storage::ForeignTable::isAppendMode(), metadataScanFiles(), resetParquetMetadata(), foreign_storage::throw_removed_file_error(), foreign_storage::throw_removed_row_in_file_error(), and total_row_count_.
Referenced by populateChunkMetadata().
void foreign_storage::ParquetDataWrapper::finalizeFragmentMap | ( | ) |
private |
Definition at line 195 of file ParquetDataWrapper.cpp.
References fragment_to_row_group_interval_map_, last_fragment_index_, and last_row_group_.
Referenced by metadataScanFiles().
std::vector< std::string > foreign_storage::ParquetDataWrapper::getAllFilePaths | ( | ) |
private |
Definition at line 302 of file ParquetDataWrapper.cpp.
References DEBUG_TIMER, foreign_storage::AbstractFileStorageDataWrapper::FILE_SORT_ORDER_BY_KEY, foreign_storage::AbstractFileStorageDataWrapper::FILE_SORT_REGEX_KEY, foreign_storage::ForeignTable::foreign_server, foreign_table_, foreign_storage::AbstractFileStorageDataWrapper::getFullFilePath(), foreign_storage::OptionsContainer::getOption(), foreign_storage::AbstractFileStorageDataWrapper::LOCAL_FILE_STORAGE_TYPE, shared::local_glob_filter_sort_files(), foreign_storage::OptionsContainer::options, foreign_storage::AbstractFileStorageDataWrapper::REGEX_PATH_FILTER_KEY, foreign_storage::AbstractFileStorageDataWrapper::STORAGE_TYPE_KEY, and UNREACHABLE.
Referenced by fetchChunkMetadata(), and getDataPreview().
ParallelismLevel foreign_storage::ParquetDataWrapper::getCachedParallelismLevel | ( | ) | const |
inlineoverridevirtual |
Gets the desired level of parallelism for the data wrapper when a cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 68 of file ParquetDataWrapper.h.
References foreign_storage::ForeignDataWrapper::INTER_FRAGMENT.
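std::list< const ColumnDescriptor * > foreign_storage::ParquetDataWrapper::getColumnsToInitialize | ( | const Interval< ColumnType > & | column_interval | ) |
private |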
Definition at line 124 of file ParquetDataWrapper.cpp.
References CHECK, db_id_, foreign_storage::Interval< T >::end, Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), schema_, and foreign_storage::Interval< T >::start.
Referenced by initializeChunkBuffers().
DataPreview foreign_storage::ParquetDataWrapper::getDataPreview | ( | const size_t | num_rows | ) |
Definition at line 609 of file ParquetDataWrapper.cpp.
References file_reader_cache_, file_system_, foreign_table_, getAllFilePaths(), foreign_storage::AbstractFileStorageDataWrapper::getFullFilePath(), and render_group_analyzer_map_.
ParallelismLevel foreign_storage::ParquetDataWrapper::getNonCachedParallelismLevel | ( | ) | const |
inlineoverridevirtual |
Gets the desired level of parallelism for the data wrapper when no cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 70 of file ParquetDataWrapper.h.
References foreign_storage::ForeignDataWrapper::INTRA_FRAGMENT.
std::set< std::string > foreign_storage::ParquetDataWrapper::getProcessedFilePaths | ( | ) |
private |
Definition at line 292 of file ParquetDataWrapper.cpp.
References fragment_to_row_group_interval_map_.
Referenced by fetchChunkMetadata().
std::string foreign_storage::ParquetDataWrapper::getSerializedDataWrapper | ( | ) | const |
overridevirtual |
Serialize the internal state of the wrapper into a string, if implemented.
Implements foreign_storage::ForeignDataWrapper.
Definition at line 566 of file ParquetDataWrapper.cpp.
References foreign_storage::json_utils::add_value_to_object(), fragment_to_row_group_interval_map_, last_fragment_index_, last_fragment_row_count_, last_row_group_, total_row_count_, and foreign_storage::json_utils::write_to_string().
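void foreign_storage::ParquetDataWrapper::initializeChunkBuffers | ( | const int | fragment_index, |
const Interval< ColumnType > & | column_interval, | ||
const ChunkToBufferMap & | required_buffers, | ||
const bool | reserve_buffers_and_set_stats = false | ||
) |
private |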
Definition at line 141 of file ParquetDataWrapper.cpp.
References CHECK, chunk_metadata_map_, db_id_, foreign_table_, shared::get_from_map(), getColumnsToInitialize(), kENCODING_NONE, and TableDescriptor::tableId.
Referenced by loadBuffersUsingLazyParquetChunkLoader().
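bool foreign_storage::ParquetDataWrapper::isNewFile | ( | const std::string & | file_path | ) | const |
private |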
Definition at line 212 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, fragment_to_row_group_interval_map_, and last_fragment_index_.
Referenced by metadataScanFiles().
bool foreign_storage::ParquetDataWrapper::isRestored | ( | ) | const |
overridevirtual |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 605 of file ParquetDataWrapper.cpp.
References is_restored_.
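void foreign_storage::ParquetDataWrapper::loadBuffersUsingLazyParquetChunkLoader | ( | const int | logical_column_id, |
const int | fragment_id, | ||
const ChunkToBufferMap & | required_buffers, | ||
AbstractBuffer * | delete_buffer | ||
) |
private |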
Definition at line 382 of file ParquetDataWrapper.cpp.
References Data_Namespace::AbstractBuffer::append(), CHECK, CHECK_GT, chunk_metadata_map_, ColumnDescriptor::columnType, db_id_, delete_buffer_mutex_, foreign_storage::Interval< T >::end, file_reader_cache_, file_system_, foreign_table_, fragment_to_row_group_interval_map_, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_elem_type(), shared::get_from_map(), SQLTypeInfo::get_physical_cols(), Catalog_Namespace::SysCatalog::getCatalog(), Data_Namespace::AbstractBuffer::getMemoryPtr(), initializeChunkBuffers(), Catalog_Namespace::SysCatalog::instance(), SQLTypeInfo::is_array(), SQLTypeInfo::is_dict_encoded_string(), SQLTypeInfo::is_geometry(), foreign_storage::LazyParquetChunkLoader::loadChunk(), render_group_analyzer_map_, schema_, Data_Namespace::AbstractBuffer::size(), foreign_storage::Interval< T >::start, and TableDescriptor::tableId.
Referenced by populateChunkBuffers().
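void foreign_storage::ParquetDataWrapper::metadataScanFiles | ( | const std::vector< std::string > & | file_paths | ) |
private |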
Definition at line 320 of file ParquetDataWrapper.cpp.
References addNewFile(), addNewFragment(), CHECK, CHECK_EQ, chunk_metadata_map_, db_id_, do_metadata_stats_validation_, file_reader_cache_, file_system_, finalizeFragmentMap(), foreign_table_, isNewFile(), last_fragment_index_, last_fragment_row_count_, last_row_group_, foreign_storage::LazyParquetChunkLoader::metadataScan(), moveToNextFragment(), foreign_storage::anonymous_namespace{ParquetDataWrapper.cpp}::reduce_metadata(), schema_, TableDescriptor::tableId, and total_row_count_.
Referenced by fetchChunkMetadata().
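bool foreign_storage::ParquetDataWrapper::moveToNextFragment | ( | size_t | new_rows_count | ) | const |
private |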
Definition at line 369 of file ParquetDataWrapper.cpp.
References foreign_table_, last_fragment_row_count_, and TableDescriptor::maxFragRows.
Referenced by metadataScanFiles().
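void foreign_storage::ParquetDataWrapper::populateChunkBuffers | ( | const ChunkToBufferMap & | required_buffers, |
const ChunkToBufferMap & | optional_buffers, | ||
AbstractBuffer * | delete_buffer | ||
) |
overridevirtual |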
Populates given chunk buffers identified by chunk keys. All provided chunk buffers are expected to be for the same fragment.
required_buffers | - chunk buffers that must always be populated |
optional_buffers | - chunk buffers that can be optionally populated, if the data wrapper has to scan through chunk data anyway (typically for row-wise data formats) |
delete_buffer | - chunk buffer for the fragment's delete column; if non-null, the data wrapper is expected to mark deleted rows in the buffer and continue processing |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 513 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_FRAGMENT_IDX, foreign_storage::create_futures_for_workers(), g_max_import_threads, loadBuffersUsingLazyParquetChunkLoader(), and schema_.
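void foreign_storage::ParquetDataWrapper::populateChunkMetadata | ( | ChunkMetadataVector & | chunk_metadata_vector | ) |
overridevirtual |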
Populates given chunk metadata vector with metadata for all chunks in related foreign table.
chunk_metadata_vector | - vector that will be populated with chunk metadata |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 374 of file ParquetDataWrapper.cpp.
References chunk_metadata_map_, and fetchChunkMetadata().
void foreign_storage::ParquetDataWrapper::resetParquetMetadata | ( | ) |
private |
Definition at line 113 of file ParquetDataWrapper.cpp.
References file_reader_cache_, fragment_to_row_group_interval_map_, last_fragment_index_, last_fragment_row_count_, last_row_group_, and total_row_count_.
Referenced by fetchChunkMetadata().
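void foreign_storage::ParquetDataWrapper::restoreDataWrapperInternals | ( | const std::string & | file_path, |
const ChunkMetadataVector & | chunk_metadata_vector | ||
) |
overridevirtual |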
Restore the internal state of the data wrapper.
file_path | - location of the file containing the serialized wrapper state (cf. getSerializedDataWrapper()) |
chunk_metadata_vector | - vector of chunk metadata recovered from disk |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 584 of file ParquetDataWrapper.cpp.
References CHECK, chunk_metadata_map_, fragment_to_row_group_interval_map_, foreign_storage::json_utils::get_value_from_object(), is_restored_, last_fragment_index_, last_fragment_row_count_, last_row_group_, foreign_storage::json_utils::read_from_file(), and total_row_count_.
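std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > foreign_storage::ParquetDataWrapper::chunk_metadata_map_ |
private |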
Definition at line 109 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), populateChunkMetadata(), and restoreDataWrapperInternals().
const int foreign_storage::ParquetDataWrapper::db_id_ |
private |
Definition at line 110 of file ParquetDataWrapper.h.
Referenced by createRenderGroupAnalyzers(), fetchChunkMetadata(), getColumnsToInitialize(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), and metadataScanFiles().
std::mutex foreign_storage::ParquetDataWrapper::delete_buffer_mutex_ |
private |
Definition at line 121 of file ParquetDataWrapper.h.
Referenced by loadBuffersUsingLazyParquetChunkLoader().
const bool foreign_storage::ParquetDataWrapper::do_metadata_stats_validation_ |
private |
Definition at line 107 of file ParquetDataWrapper.h.
Referenced by metadataScanFiles().
std::unique_ptr< FileReaderMap > foreign_storage::ParquetDataWrapper::file_reader_cache_ |
private |
Definition at line 119 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getDataPreview(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and resetParquetMetadata().
std::shared_ptr< arrow::fs::FileSystem > foreign_storage::ParquetDataWrapper::file_system_ |
private |
Definition at line 118 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getDataPreview(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and ParquetDataWrapper().
const ForeignTable * foreign_storage::ParquetDataWrapper::foreign_table_ |
private |
Definition at line 111 of file ParquetDataWrapper.h.
Referenced by createRenderGroupAnalyzers(), fetchChunkMetadata(), getAllFilePaths(), getDataPreview(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and moveToNextFragment().
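std::map< int, std::vector< RowGroupInterval > > foreign_storage::ParquetDataWrapper::fragment_to_row_group_interval_map_ |
private |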
Definition at line 108 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), getProcessedFilePaths(), getSerializedDataWrapper(), isNewFile(), loadBuffersUsingLazyParquetChunkLoader(), resetParquetMetadata(), and restoreDataWrapperInternals().
bool foreign_storage::ParquetDataWrapper::is_restored_ |
private |
Definition at line 116 of file ParquetDataWrapper.h.
Referenced by isRestored(), and restoreDataWrapperInternals().
int foreign_storage::ParquetDataWrapper::last_fragment_index_ |
private |
Definition at line 112 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), getSerializedDataWrapper(), isNewFile(), metadataScanFiles(), resetParquetMetadata(), and restoreDataWrapperInternals().
size_t foreign_storage::ParquetDataWrapper::last_fragment_row_count_ |
private |
Definition at line 113 of file ParquetDataWrapper.h.
Referenced by addNewFragment(), getSerializedDataWrapper(), metadataScanFiles(), moveToNextFragment(), resetParquetMetadata(), and restoreDataWrapperInternals().
int foreign_storage::ParquetDataWrapper::last_row_group_ |
private |
Definition at line 115 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), getSerializedDataWrapper(), metadataScanFiles(), resetParquetMetadata(), and restoreDataWrapperInternals().
RenderGroupAnalyzerMap foreign_storage::ParquetDataWrapper::render_group_analyzer_map_ |
private |
Definition at line 126 of file ParquetDataWrapper.h.
Referenced by createRenderGroupAnalyzers(), getDataPreview(), and loadBuffersUsingLazyParquetChunkLoader().
std::unique_ptr< ForeignTableSchema > foreign_storage::ParquetDataWrapper::schema_ |
private |
Definition at line 117 of file ParquetDataWrapper.h.
Referenced by getColumnsToInitialize(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanFiles(), and populateChunkBuffers().
size_t foreign_storage::ParquetDataWrapper::total_row_count_ |
private |
Definition at line 114 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getSerializedDataWrapper(), metadataScanFiles(), resetParquetMetadata(), and restoreDataWrapperInternals().