OmniSciDB  bf83d84833
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
CsvDataWrapper.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <map>
20 #include <vector>
21 
22 #include "Catalog/Catalog.h"
23 #include "Catalog/ForeignTable.h"
24 #include "DataMgr/Chunk/Chunk.h"
26 #include "ForeignDataWrapper.h"
27 #include "ImportExport/Importer.h"
28 
29 namespace foreign_storage {
30 
/**
 * Describes one contiguous run of rows inside a single file.
 *
 * Regions compare by the byte offset at which they start (see operator<), so a
 * collection of regions can be sorted into file order.
 */
struct FileRegion {
  // Name of file containing region
  std::string filename;
  // Byte offset (within file) for the beginning of file region
  size_t first_row_file_offset{0};
  // Index of first row in file region relative to the first row/non-header line in the
  // file
  size_t first_row_index{0};
  // Number of rows in file region
  size_t row_count{0};
  // Size of file region in bytes
  size_t region_size{0};

  /// Creates an empty region: no file name and every counter set to zero.
  FileRegion() = default;

  /**
   * Creates a fully specified region.
   *
   * @param name             name of the file containing the region
   * @param first_row_offset byte offset (within the file) where the region begins
   * @param first_row_idx    index of the region's first row, relative to the first
   *                         non-header line of the file
   * @param row_cnt          number of rows in the region
   * @param region_sz        size of the region in bytes
   */
  FileRegion(std::string name,
             size_t first_row_offset,
             size_t first_row_idx,
             size_t row_cnt,
             size_t region_sz)
      : filename(name)
      , first_row_file_offset(first_row_offset)
      , first_row_index(first_row_idx)
      , row_count(row_cnt)
      , region_size(region_sz) {}

  /**
   * Orders regions by their starting byte offset.
   *
   * Only first_row_file_offset takes part in the comparison; the file name and
   * the remaining counters are ignored.
   *
   * @return true when this region starts at a lower byte offset than @p other
   */
  bool operator<(const FileRegion& other) const {
    return first_row_file_offset < other.first_row_file_offset;
  }
};

// A list of file regions.
using FileRegions = std::vector<FileRegion>;
65 
67  public:
    // Member declarations of CsvDataWrapper.
    // NOTE(review): the class-head line (listing line 66) is absent from this
    // extract. The `override` specifiers below imply a base class that declares
    // these virtuals - presumably the one from "ForeignDataWrapper.h"; confirm
    // against the real header.

    // Public constructor: takes a database id and a pointer to the foreign table.
68  CsvDataWrapper(const int db_id, const ForeignTable* foreign_table);
69 
    // Override; receives the chunk metadata vector by non-const reference.
70  void populateChunkMetadata(ChunkMetadataVector& chunk_metadata_vector) override;
71 
    // NOTE(review): listing line 72 is missing here. The member index at the end
    // of this page gives the full declaration as
    // `void populateChunkBuffers(required_buffers, optional_buffers) override`;
    // the two lines below are its parameter list.
73  std::map<ChunkKey, AbstractBuffer*>& required_buffers,
74  std::map<ChunkKey, AbstractBuffer*>& optional_buffers) override;
75 
    // Static helper taking the foreign table to check; needs no wrapper instance.
76  static void validateOptions(const ForeignTable* foreign_table);
77 
    // Static accessor returning option names as string views
    // (presumably derived from supported_options_ below - confirm in the .cpp).
78  static std::vector<std::string_view> getSupportedOptions();
79 
    // Const override; takes the path of the file to serialize to.
80  void serializeDataWrapperInternals(const std::string& file_path) const override;
81 
    // Override; takes the path of the file to restore from plus the chunk
    // metadata vector (read-only).
82  void restoreDataWrapperInternals(const std::string& file_path,
83  const ChunkMetadataVector& chunk_metadata) override;
    // Const override returning a bool.
84  bool isRestored() const override;
85 
86  private:
    // Private constructor that takes only the foreign table (no database id).
87  CsvDataWrapper(const ForeignTable* foreign_table);
88 
    // NOTE(review): listing lines 89-95, 103-111 and 115-123 are absent from this
    // extract (presumably documentation comments dropped by the extraction).
96  void populateChunks(std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map,
97  int fragment_id);
98 
99  std::string getFilePath();
    // NOTE(review): listing line 100 is missing. The member index at the end of
    // this page lists `import_export::CopyParams validateAndGetCopyParams()`,
    // which is not declared anywhere in the visible lines - it may belong here.
101  void validateFilePath();
102 
    // Takes an option name and an expected character count; returns a string.
112  std::string validateAndGetStringWithLength(const std::string& option_name,
113  const size_t expected_num_chars);
114 
    // Takes an option name; the optional return type allows "no value".
124  std::optional<bool> validateAndGetBoolValue(const std::string& option_name);
125 
    // Reads `columns`, `fragment_id` and `buffers` (all const) and receives the
    // column-id -> chunk map by non-const reference.
126  void populateChunkMapForColumns(const std::set<const ColumnDescriptor*>& columns,
127  const int fragment_id,
128  const std::map<ChunkKey, AbstractBuffer*>& buffers,
129  std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map);
130 
131  void updateMetadata(std::map<int, Chunk_NS::Chunk>& column_id_to_chunk_map,
132  int fragment_id);
133 
    // Chunk key -> shared chunk metadata.
134  std::map<ChunkKey, std::shared_ptr<ChunkMetadata>> chunk_metadata_map_;
    // Fragment id -> list of file regions.
135  std::map<int, FileRegions> fragment_id_to_file_regions_map_;
136 
137  std::unique_ptr<CsvReader> csv_reader_;
138 
139  const int db_id_;
    // NOTE(review): listing line 140 is missing. The member index at the end of
    // this page lists `const ForeignTable * foreign_table_`, which may belong here.
    // Presumably guards file access, going by the name - confirm usage in the .cpp.
141  std::mutex file_access_mutex_;
142 
143  // Data needed for append workflow
144  std::map<ChunkKey, std::unique_ptr<ForeignStorageBuffer>> chunk_encoder_buffers_;
145  std::map<ChunkKey, size_t> chunk_byte_count_;
146  // How many rows have been read
147  size_t num_rows_;
    // NOTE(review): the member declarations that the next two comments describe
    // (listing lines 149 and 151) are missing from this extract.
148  // What byte offset we left off at in the csv_reader
150  // Is this datawrapper restored from disk
    // Compile-time table of option names. The array extent (11) must equal the
    // number of string literals listed below.
152  static constexpr std::array<char const*, 11> supported_options_{"ARRAY_DELIMITER",
153  "ARRAY_MARKER",
154  "BUFFER_SIZE",
155  "DELIMITER",
156  "ESCAPE",
157  "HEADER",
158  "LINE_DELIMITER",
159  "LONLAT",
160  "NULLS",
161  "QUOTE",
162  "QUOTED"};
163 };
164 } // namespace foreign_storage
bool isRestored() const override
void populateChunkBuffers(std::map< ChunkKey, AbstractBuffer * > &required_buffers, std::map< ChunkKey, AbstractBuffer * > &optional_buffers) override
void serializeDataWrapperInternals(const std::string &file_path) const override
std::map< ChunkKey, size_t > chunk_byte_count_
void restoreDataWrapperInternals(const std::string &file_path, const ChunkMetadataVector &chunk_metadata) override
std::map< ChunkKey, std::unique_ptr< ForeignStorageBuffer > > chunk_encoder_buffers_
bool operator<(const FileRegion &other) const
static std::vector< std::string_view > getSupportedOptions()
static void validateOptions(const ForeignTable *foreign_table)
FileRegion(std::string name, size_t first_row_offset, size_t first_row_idx, size_t row_cnt, size_t region_sz)
std::unique_ptr< CsvReader > csv_reader_
static constexpr std::array< char const *, 11 > supported_options_
This file contains the class specification and related data structures for Catalog.
std::vector< FileRegion > FileRegions
std::string validateAndGetStringWithLength(const std::string &option_name, const size_t expected_num_chars)
void populateChunkMetadata(ChunkMetadataVector &chunk_metadata_vector) override
std::map< int, FileRegions > fragment_id_to_file_regions_map_
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
void updateMetadata(std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map, int fragment_id)
void populateChunkMapForColumns(const std::set< const ColumnDescriptor * > &columns, const int fragment_id, const std::map< ChunkKey, AbstractBuffer * > &buffers, std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map)
CsvDataWrapper(const int db_id, const ForeignTable *foreign_table)
std::optional< bool > validateAndGetBoolValue(const std::string &option_name)
std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > chunk_metadata_map_
const ForeignTable * foreign_table_
void populateChunks(std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map, int fragment_id)
string name
Definition: setup.py:35
import_export::CopyParams validateAndGetCopyParams()