OmniSciDB  c0231cc57d
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
LazyParquetChunkLoader.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <arrow/filesystem/filesystem.h>
20 #include <parquet/schema.h>
21 
22 #include "DataMgr/Chunk/Chunk.h"
23 #include "DataPreview.h"
24 #include "ForeignTableSchema.h"
25 #include "Interval.h"
26 #include "ParquetEncoder.h"
27 #include "ParquetShared.h"
29 
// Global declared here and defined in another translation unit.
// Presumably an upper bound on the number of import worker threads (inferred
// from the name only) -- TODO confirm at the definition.
30 extern size_t g_max_import_threads;
31 
32 namespace foreign_storage {
33 
38  public:
// NOTE(review): the constant below counts elements, while the rationale in the
// comment is stated in bytes; the two coincide only for 1-byte elements.
39  // Number of elements read from the Parquet file per batch;
40  // this value is subject to change with performance tuning.
41  // Most filesystems use a default block size of 4096 bytes.
42  const static int batch_reader_num_elements = 4096;
43 
// Constructor.
//   file_system               - shared handle to the Arrow filesystem.
//   file_reader_cache         - raw pointer to a FileReaderMap.
//   render_group_analyzer_map - raw pointer to a read-only RenderGroupAnalyzerMap.
//   foreign_table_name        - name of the foreign table this loader is used for.
// NOTE(review): ownership and lifetime of the two raw pointers are not visible
// in this declaration; presumably they must outlive the loader -- confirm
// against callers.
44  LazyParquetChunkLoader(std::shared_ptr<arrow::fs::FileSystem> file_system,
45  FileReaderMap* file_reader_cache,
46  const RenderGroupAnalyzerMap* render_group_analyzer_map,
47  const std::string& foreign_table_name);
48 
// Declaration only; the following is read from the signature.
//   row_group_intervals  - the row group intervals to process.
//   parquet_column_index - index of the Parquet column to process.
//   chunks               - mutable list of chunks (passed by non-const reference).
//   string_dictionary    - optional; defaults to nullptr.
//   rejected_row_indices - optional; defaults to nullptr.
// Returns a list of owned ChunkMetadata objects.
// NOTE(review): presumably loads the column data for the given intervals into
// `chunks` and records rejected rows when rejected_row_indices is non-null --
// confirm against the implementation.
72  std::list<std::unique_ptr<ChunkMetadata>> loadChunk(
73  const std::vector<RowGroupInterval>& row_group_intervals,
74  const int parquet_column_index,
75  std::list<Chunk_NS::Chunk>& chunks,
76  StringDictionary* string_dictionary = nullptr,
77  RejectedRowIndices* rejected_row_indices = nullptr);
78 
// Perform a metadata scan for the paths specified.
//   file_paths                   - paths to scan.
//   schema                       - foreign table schema passed to the scan.
//   do_metadata_stats_validation - defaults to true; presumably toggles
//                                  validation of metadata statistics (inferred
//                                  from the name) -- TODO confirm.
// Returns a list of RowGroupMetadata.
89  std::list<RowGroupMetadata> metadataScan(
90  const std::vector<std::string>& file_paths,
91  const ForeignTableSchema& schema,
92  const bool do_metadata_stats_validation = true);
93 
// Static predicate over a pair of column descriptors: the table-side
// ColumnDescriptor and the Parquet-side parquet::ColumnDescriptor.
// Presumably returns true when the Parquet column can be mapped to the given
// table column (inferred from the name) -- confirm against the implementation.
103  static bool isColumnMappingSupported(const ColumnDescriptor* omnisci_column,
104  const parquet::ColumnDescriptor* parquet_column);
105 
// Load row groups of data into given chunks.
//   row_group_interval  - the single row group interval to load.
//   chunks              - map from an int key to the chunk to fill.
//   schema              - foreign table schema.
//   column_dictionaries - map from an int key to a StringDictionary pointer.
//   num_threads         - defaults to 1.
// Returns a pair of size_t values.
// NOTE(review): the meaning of the two returned values and of the int map keys
// (presumably column ids) is not visible in this declaration -- confirm
// against the implementation.
124  std::pair<size_t, size_t> loadRowGroups(
125  const RowGroupInterval& row_group_interval,
126  const std::map<int, Chunk_NS::Chunk>& chunks,
127  const ForeignTableSchema& schema,
128  const std::map<int, StringDictionary*>& column_dictionaries,
129  const int num_threads = 1);
130 
// Preview rows of data and column types in a set of files.
//   files        - the files to preview.
//   max_num_rows - presumably an upper bound on the number of previewed rows
//                  (inferred from the name) -- TODO confirm.
//   table        - the foreign table the preview is made for.
// Returns a DataPreview.
141  DataPreview previewFiles(const std::vector<std::string>& files,
142  const size_t max_num_rows,
143  const ForeignTable& table);
144 
145  private:
159  const parquet::ColumnDescriptor* parquet_column);
160 
// Private helper; declaration only, the following is read from the signature.
// Its first four non-descriptor parameters mirror loadChunk (row group
// intervals, Parquet column index, chunks, string dictionary, rejected row
// indices), with an explicit column_descriptor added.
//   is_for_detect   - defaults to false.
//   max_levels_read - optional limit; defaults to std::nullopt (no value).
// Returns a list of owned ChunkMetadata objects.
// NOTE(review): presumably appends the data of the given row groups to
// `chunks`; the exact effect of is_for_detect and max_levels_read is not
// visible here -- confirm against the implementation.
161  std::list<std::unique_ptr<ChunkMetadata>> appendRowGroups(
162  const std::vector<RowGroupInterval>& row_group_intervals,
163  const int parquet_column_index,
164  const ColumnDescriptor* column_descriptor,
165  std::list<Chunk_NS::Chunk>& chunks,
166  StringDictionary* string_dictionary,
167  RejectedRowIndices* rejected_row_indices,
168  const bool is_for_detect = false,
169  const std::optional<int64_t> max_levels_read = std::nullopt);
170 
// Shared handle to an Arrow filesystem; presumably initialised from the
// constructor's file_system argument -- TODO confirm.
171  std::shared_ptr<arrow::fs::FileSystem> file_system_;
173 
// Foreign table name held by value; presumably a copy of the constructor's
// foreign_table_name argument -- TODO confirm.
175  std::string foreign_table_name_;
176 };
177 } // namespace foreign_storage
static bool isColumnMappingSupported(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
const RenderGroupAnalyzerMap * render_group_analyzer_map_
std::list< std::unique_ptr< ChunkMetadata > > loadChunk(const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary=nullptr, RejectedRowIndices *rejected_row_indices=nullptr)
LazyParquetChunkLoader(std::shared_ptr< arrow::fs::FileSystem > file_system, FileReaderMap *file_reader_cache, const RenderGroupAnalyzerMap *render_group_analyzer_map, const std::string &foreign_table_name)
Specifies the in-memory content of a row in the column metadata table.
std::pair< size_t, size_t > loadRowGroups(const RowGroupInterval &row_group_interval, const std::map< int, Chunk_NS::Chunk > &chunks, const ForeignTableSchema &schema, const std::map< int, StringDictionary * > &column_dictionaries, const int num_threads=1)
Load row groups of data into given chunks.
std::list< RowGroupMetadata > metadataScan(const std::vector< std::string > &file_paths, const ForeignTableSchema &schema, const bool do_metadata_stats_validation=true)
Perform a metadata scan for the paths specified.
std::set< int64_t > RejectedRowIndices
DataPreview previewFiles(const std::vector< std::string > &files, const size_t max_num_rows, const ForeignTable &table)
Preview rows of data and column types in a set of files.
std::list< std::unique_ptr< ChunkMetadata > > appendRowGroups(const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, const ColumnDescriptor *column_descriptor, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, RejectedRowIndices *rejected_row_indices, const bool is_for_detect=false, const std::optional< int64_t > max_levels_read=std::nullopt)
std::shared_ptr< arrow::fs::FileSystem > file_system_
std::map< int, std::unique_ptr< import_export::RenderGroupAnalyzer >> RenderGroupAnalyzerMap
size_t g_max_import_threads
Definition: Importer.cpp:106
static SQLTypeInfo suggestColumnMapping(const parquet::ColumnDescriptor *parquet_column)