OmniSciDB  2e3a973ef4
LazyParquetChunkLoader.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <arrow/filesystem/filesystem.h>
20 #include <parquet/schema.h>
21 
22 #include "DataMgr/Chunk/Chunk.h"
23 #include "ImportExport/Importer.h"
24 #include "Interval.h"
25 #include "ParquetShared.h"
26 
27 namespace foreign_storage {
32  public:
33  // Number of elements read from the Parquet file in a single batch.
34  // The value is subject to change with performance tuning.
35  // NOTE(review): 4096 was presumably picked because most filesystems use a default block size of 4096 bytes; note that this constant counts elements, not bytes.
36  const static int batch_reader_num_elements = 4096;
37 
// Constructs a loader from a shared handle to an Arrow filesystem.
// NOTE(review): the definition is not visible in this listing; presumably the
// handle is retained in file_system_ for later reads -- confirm in the .cpp.
38  LazyParquetChunkLoader(std::shared_ptr<arrow::fs::FileSystem> file_system);
39 
// Loads one Parquet column, restricted to the given row-group intervals, into
// the supplied chunks (as suggested by the name and signature; the definition
// and the original doc comment are not visible in this listing).
//
// @param row_group_intervals  intervals of row groups to read (read-only)
// @param parquet_column_index index of the column in the Parquet file
// @param chunks               list of chunks taken by non-const reference;
//                             presumably populated by the call -- TODO confirm
// @param string_dictionary    optional string dictionary; defaults to nullptr
// @return a list of uniquely-owned ChunkMetadata objects
//         NOTE(review): when the list is empty vs. populated is decided in the
//         definition -- verify before relying on it.
63  std::list<std::unique_ptr<ChunkMetadata>> loadChunk(
64  const std::vector<RowGroupInterval>& row_group_intervals,
65  const int parquet_column_index,
66  std::list<Chunk_NS::Chunk>& chunks,
67  StringDictionary* string_dictionary = nullptr);
68 
// Static predicate over an OmniSci column descriptor and a Parquet column
// descriptor; callable without a loader instance.
// NOTE(review): by its name, reports whether the Parquet column can be mapped
// to the OmniSci column; the exact rules live in the definition, which is not
// visible in this listing.
//
// @param omnisci_column  descriptor of the OmniSci column (read-only pointer)
// @param parquet_column  descriptor of the Parquet column (read-only pointer)
// @return bool -- presumably true when the mapping is supported; confirm
78  static bool isColumnMappingSupported(const ColumnDescriptor* omnisci_column,
79  const parquet::ColumnDescriptor* parquet_column);
80 
81  private:
// Shared handle to an Arrow filesystem; same type as the constructor's parameter.
82  std::shared_ptr<arrow::fs::FileSystem> file_system_;
83 };
84 
85 } // namespace foreign_storage
static bool isColumnMappingSupported(const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column)
std::list< std::unique_ptr< ChunkMetadata > > loadChunk(const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary=nullptr)
LazyParquetChunkLoader(std::shared_ptr< arrow::fs::FileSystem > file_system)
Specifies the in-memory content of a row in the column metadata table.
std::shared_ptr< arrow::fs::FileSystem > file_system_