OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TextFileBufferParser.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
21 
22 #include "ImportExport/Importer.h"
24 
25 namespace foreign_storage {
26 
28  ParseBufferRequest(const ParseBufferRequest& request) = delete;
29  ParseBufferRequest(ParseBufferRequest&& request) = default;
32  int db_id,
33  const ForeignTable* foreign_table,
34  const std::set<int> column_filter_set,
35  const std::string& full_path,
37  const bool track_rejected_rows = false);
38 
// Returns the catalog for this request as a shared_ptr. The CHECK macro is applied to
// `catalog` before it is returned.
// NOTE(review): the statement that defines the local `catalog` (original line 40) is
// missing from this listing; presumably it looks the catalog up using db_id -- confirm
// against the real header.
39  inline std::shared_ptr<Catalog_Namespace::Catalog> getCatalog() const {
41  CHECK(catalog);
42  return catalog;
43  }
44 
// Returns, by value, the list of column descriptors that
// foreign_table_schema->getLogicalAndPhysicalColumns() produces.
45  inline std::list<const ColumnDescriptor*> getColumns() const {
46  return foreign_table_schema->getLogicalAndPhysicalColumns();
47  }
48 
// Returns the tableId field of the foreign table held by foreign_table_schema.
49  inline int32_t getTableId() const {
50  return foreign_table_schema->getForeignTable()->tableId;
51  }
52 
// Returns a copy of the tableName field of the foreign table held by
// foreign_table_schema.
53  inline std::string getTableName() const {
54  return foreign_table_schema->getForeignTable()->tableName;
55  }
56 
// Returns the maxFragRows field of the foreign table held by foreign_table_schema,
// as a size_t.
57  inline size_t getMaxFragRows() const {
58  return foreign_table_schema->getForeignTable()->maxFragRows;
59  }
60 
// Returns a copy of the full_path member.
61  inline std::string getFilePath() const { return full_path; }
62 
63  // These must be initialized at construction (before parsing).
64  std::unique_ptr<char[]> buffer;
65  size_t buffer_size;
68  const int db_id;
69  std::unique_ptr<ForeignTableSchema> foreign_table_schema;
70  std::vector<std::unique_ptr<import_export::TypedImportBuffer>> import_buffers;
72 
73  // These are set during parsing.
75  size_t begin_pos;
76  size_t end_pos;
78  size_t file_offset;
80  std::string full_path;
81 
82  // This parameter controls the behaviour of error handling in the data wrapper
83  const bool track_rejected_rows;
84 
85  // This tracks the number of rows processed; it is necessary to identify requests that
86  // are not completed.
88 };
89 
91  std::map<int, DataBlockPtr> column_id_to_data_blocks_map;
92  size_t row_count;
93  std::vector<size_t> row_offsets;
94  std::set<size_t> rejected_rows;
95 };
96 
98  public:
109  bool convert_data_blocks,
110  bool columns_are_pre_filtered = false,
111  bool skip_dict_encoding = false) const = 0;
117  const ForeignTable* foreign_table) const = 0;
118 
// Pure virtual hook that each concrete parser must implement. Judging by its name, it
// locates the end position of the row data held in `buffer`.
// NOTE(review): the original doc comment is not part of this listing -- confirm the
// exact contract and the meaning of the returned size_t.
// alloc_size, buffer, buffer_size and num_rows_in_buffer are taken by non-const
// reference, so an implementation is able to modify them; file_reader is a pointer to a
// non-const FileReader. copy_params is read-only.
126  virtual size_t findRowEndPosition(size_t& alloc_size,
127  std::unique_ptr<char[]>& buffer,
128  size_t& buffer_size,
129  const import_export::CopyParams& copy_params,
130  const size_t buffer_first_row_index,
131  unsigned int& num_rows_in_buffer,
132  FileReader* file_reader) const = 0;
133 
// Pure virtual hook that each concrete parser must implement. Both arguments are
// pointers to const, so an implementation can only inspect the reader and the table.
// The function returns nothing; how a validation failure is reported is not visible
// in this listing.
137  virtual void validateFiles(const FileReader* file_reader,
138  const ForeignTable* foreign_table) const = 0;
139 
// Static helper that produces a std::map<int, DataBlockPtr> from the given import
// buffers, which are taken by const reference. skip_dict_encoding defaults to false.
// NOTE(review): the int key is presumably the column id (ParseBufferResult has a member
// of the same map type named column_id_to_data_blocks_map) -- confirm.
140  static std::map<int, DataBlockPtr> convertImportBuffersToDataBlocks(
141  const std::vector<std::unique_ptr<import_export::TypedImportBuffer>>&
142  import_buffers,
143  const bool skip_dict_encoding = false);
144 
// Static predicate over a single datum, passed as a std::string_view.
// NOTE(review): by its name it presumably reports whether the datum is a single
// coordinate value rather than a complete geo literal -- confirm against the definition.
145  static bool isCoordinateScalar(const std::string_view datum);
146 
// Static helper that, by its name, handles one geo column of a parsed row.
// import_buffers, col_idx, cd_it, row and import_idx are non-const references, so the
// call is able to modify the caller's buffers and cursor state.
// NOTE(review): presumably it advances col_idx / cd_it past the columns it consumed --
// confirm against the definition.
// catalog is a shared_ptr passed by value; render_group_analyzer_map is a pointer to
// const. Whether that pointer may be null is not visible in this listing.
147  static void processGeoColumn(
148  std::vector<std::unique_ptr<import_export::TypedImportBuffer>>& import_buffers,
149  size_t& col_idx,
150  const import_export::CopyParams& copy_params,
151  std::list<const ColumnDescriptor*>::iterator& cd_it,
152  std::vector<std::string_view>& row,
153  size_t& import_idx,
154  bool is_null,
155  size_t first_row_index,
156  size_t row_index_plus_one,
157  std::shared_ptr<Catalog_Namespace::Catalog> catalog,
158  const RenderGroupAnalyzerMap* render_group_analyzer_map);
159 
// Static helper that, by its name, fills in data for a row that was rejected.
// columns is read-only; cd_it is a non-const iterator reference and request is a
// non-const reference, so both may be modified by the call.
// NOTE(review): the original doc comment is not part of this listing -- confirm what
// "invalid data" is written and for which columns.
164  static void fillRejectedRowWithInvalidData(
165  const std::list<const ColumnDescriptor*>& columns,
166  std::list<const ColumnDescriptor*>::iterator& cd_it,
167  const size_t col_idx,
168  ParseBufferRequest& request);
169 
// Static predicate that takes a datum, the descriptor of the column it belongs to, and
// a null indicator string.
// NOTE(review): by its name it reports whether the datum should be treated as null for
// that column; the exact rules live in the definition -- confirm there.
170  static bool isNullDatum(const std::string_view datum,
171  const ColumnDescriptor* column,
172  const std::string& null_indicator);
173 
// String constant with the value "BUFFER_SIZE". Being declared `inline static`, this
// in-class declaration is also its definition (C++17 inline variable).
// NOTE(review): presumably the name of a foreign table option -- confirm.
174  inline static const std::string BUFFER_SIZE_KEY = "BUFFER_SIZE";
175 
176  private:
// Private static helper that, by its name, handles a geo column whose value is invalid.
// import_buffers and col_idx are non-const references, so the call is able to modify
// them; cd points to a const ColumnDescriptor; catalog is a shared_ptr passed by value.
// NOTE(review): what is written into the buffers is not visible here -- confirm against
// the definition.
177  static void processInvalidGeoColumn(
178  std::vector<std::unique_ptr<import_export::TypedImportBuffer>>& import_buffers,
179  size_t& col_idx,
180  const import_export::CopyParams& copy_params,
181  const ColumnDescriptor* cd,
182  std::shared_ptr<Catalog_Namespace::Catalog> catalog);
183 };
184 } // namespace foreign_storage
std::vector< std::unique_ptr< import_export::TypedImportBuffer > > import_buffers
virtual void validateFiles(const FileReader *file_reader, const ForeignTable *foreign_table) const =0
ParseBufferRequest(const ParseBufferRequest &request)=delete
static std::map< int, DataBlockPtr > convertImportBuffersToDataBlocks(const std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, const bool skip_dict_encoding=false)
std::map< int, DataBlockPtr > column_id_to_data_blocks_map
virtual ParseBufferResult parseBuffer(ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered=false, bool skip_dict_encoding=false) const =0
const import_export::CopyParams copy_params
static void fillRejectedRowWithInvalidData(const std::list< const ColumnDescriptor * > &columns, std::list< const ColumnDescriptor * >::iterator &cd_it, const size_t col_idx, ParseBufferRequest &request)
std::unique_ptr< ForeignTableSchema > foreign_table_schema
static SysCatalog & instance()
Definition: SysCatalog.h:343
CONSTEXPR DEVICE bool is_null(const T &value)
specifies the content in-memory of a row in the column metadata table
std::list< const ColumnDescriptor * > getColumns() const
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
std::shared_ptr< Catalog_Namespace::Catalog > getCatalog() const
virtual size_t findRowEndPosition(size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const import_export::CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FileReader *file_reader) const =0
static void processInvalidGeoColumn(std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, const ColumnDescriptor *cd, std::shared_ptr< Catalog_Namespace::Catalog > catalog)
#define CHECK(condition)
Definition: Logger.h:291
virtual import_export::CopyParams validateAndGetCopyParams(const ForeignTable *foreign_table) const =0
const RenderGroupAnalyzerMap * render_group_analyzer_map
static void processGeoColumn(std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, std::list< const ColumnDescriptor * >::iterator &cd_it, std::vector< std::string_view > &row, size_t &import_idx, bool is_null, size_t first_row_index, size_t row_index_plus_one, std::shared_ptr< Catalog_Namespace::Catalog > catalog, const RenderGroupAnalyzerMap *render_group_analyzer_map)
static bool isNullDatum(const std::string_view datum, const ColumnDescriptor *column, const std::string &null_indicator)
static bool isCoordinateScalar(const std::string_view datum)
std::map< int, std::unique_ptr< import_export::RenderGroupAnalyzer >> RenderGroupAnalyzerMap