OmniSciDB  a987f07e93
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::TextFileBufferParser Class Reference (abstract)

#include <TextFileBufferParser.h>

+ Inheritance diagram for foreign_storage::TextFileBufferParser:

Public Member Functions

virtual ParseBufferResult parseBuffer (ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered=false, bool skip_dict_encoding=false) const =0
 
virtual import_export::CopyParams validateAndGetCopyParams (const ForeignTable *foreign_table) const =0
 
virtual size_t findRowEndPosition (size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const import_export::CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FileReader *file_reader) const =0
 
virtual void validateFiles (const FileReader *file_reader, const ForeignTable *foreign_table) const =0
 

Static Public Member Functions

static std::map< int, DataBlockPtr > convertImportBuffersToDataBlocks (const std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, const bool skip_dict_encoding=false)
 
static bool isCoordinateScalar (const std::string_view datum)
 
static void processGeoColumn (std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, std::list< const ColumnDescriptor * >::iterator &cd_it, std::vector< std::string_view > &row, size_t &import_idx, bool is_null, size_t first_row_index, size_t row_index_plus_one, std::shared_ptr< Catalog_Namespace::Catalog > catalog, const RenderGroupAnalyzerMap *render_group_analyzer_map)
 
static void fillRejectedRowWithInvalidData (const std::list< const ColumnDescriptor * > &columns, std::list< const ColumnDescriptor * >::iterator &cd_it, const size_t col_idx, ParseBufferRequest &request)
 
static bool isNullDatum (const std::string_view datum, const ColumnDescriptor *column, const std::string &null_indicator)
 

Static Public Attributes

static const std::string BUFFER_SIZE_KEY = "BUFFER_SIZE"
 

Static Private Member Functions

static void processInvalidGeoColumn (std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, const ColumnDescriptor *cd, std::shared_ptr< Catalog_Namespace::Catalog > catalog)
 

Detailed Description

Definition at line 97 of file TextFileBufferParser.h.

Member Function Documentation

std::map< int, DataBlockPtr > foreign_storage::TextFileBufferParser::convertImportBuffersToDataBlocks ( const std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &  import_buffers,
const bool  skip_dict_encoding = false 
)
static

Definition at line 62 of file TextFileBufferParser.cpp.

References DataBlockPtr::arraysPtr, threading_serial::async(), CHECK, CHECK_EQ, IS_STRING, kARRAY, kBOOLEAN, kENCODING_DICT, kENCODING_NONE, DataBlockPtr::numbersPtr, run_benchmark_import::result, and DataBlockPtr::stringsPtr.

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), foreign_storage::RegexFileBufferParser::parseBuffer(), and foreign_storage::InternalSystemDataWrapper::populateChunkBuffers().

64  {
65  std::map<int, DataBlockPtr> result;
66  std::vector<std::pair<const size_t, std::future<int8_t*>>>
67  encoded_data_block_ptrs_futures;
68  // make all async calls to string dictionary here and then continue execution
69  for (const auto& import_buffer : import_buffers) {
70  if (import_buffer == nullptr) {
71  continue;
72  }
73  DataBlockPtr p;
74  if (import_buffer->getTypeInfo().is_number() ||
75  import_buffer->getTypeInfo().is_time() ||
76  import_buffer->getTypeInfo().get_type() == kBOOLEAN) {
77  p.numbersPtr = import_buffer->getAsBytes();
78  } else if (import_buffer->getTypeInfo().is_string()) {
79  auto string_payload_ptr = import_buffer->getStringBuffer();
80  if (import_buffer->getTypeInfo().get_compression() == kENCODING_NONE) {
81  p.stringsPtr = string_payload_ptr;
82  } else {
83  CHECK_EQ(kENCODING_DICT, import_buffer->getTypeInfo().get_compression());
84  p.numbersPtr = nullptr;
85 
86  if (!skip_dict_encoding) {
87  auto column_id = import_buffer->getColumnDesc()->columnId;
88  encoded_data_block_ptrs_futures.emplace_back(std::make_pair(
89  column_id,
90  std::async(std::launch::async, [&import_buffer, string_payload_ptr] {
91  import_buffer->addDictEncodedString(*string_payload_ptr);
92  return import_buffer->getStringDictBuffer();
93  })));
94  }
95  }
96  } else if (import_buffer->getTypeInfo().is_geometry()) {
97  auto geo_payload_ptr = import_buffer->getGeoStringBuffer();
98  p.stringsPtr = geo_payload_ptr;
99  } else {
100  CHECK(import_buffer->getTypeInfo().get_type() == kARRAY);
101  if (IS_STRING(import_buffer->getTypeInfo().get_subtype())) {
102  CHECK(import_buffer->getTypeInfo().get_compression() == kENCODING_DICT);
103  import_buffer->addDictEncodedStringArray(*import_buffer->getStringArrayBuffer());
104  p.arraysPtr = import_buffer->getStringArrayDictBuffer();
105  } else {
106  p.arraysPtr = import_buffer->getArrayBuffer();
107  }
108  }
109  result[import_buffer->getColumnDesc()->columnId] = p;
110  }
111 
112  if (!skip_dict_encoding) {
113  // wait for the async requests we made for string dictionary
114  for (auto& encoded_ptr_future : encoded_data_block_ptrs_futures) {
115  encoded_ptr_future.second.wait();
116  }
117  for (auto& encoded_ptr_future : encoded_data_block_ptrs_futures) {
118  result[encoded_ptr_future.first].numbersPtr = encoded_ptr_future.second.get();
119  }
120  }
121  return result;
122 }
#define CHECK_EQ(x, y)
Definition: Logger.h:297
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:222
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:223
future< Result > async(Fn &&fn, Args &&...args)
#define IS_STRING(T)
Definition: sqltypes.h:297
#define CHECK(condition)
Definition: Logger.h:289
int8_t * numbersPtr
Definition: sqltypes.h:221

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::TextFileBufferParser::fillRejectedRowWithInvalidData ( const std::list< const ColumnDescriptor * > &  columns,
std::list< const ColumnDescriptor * >::iterator &  cd_it,
const size_t  col_idx,
ParseBufferRequest &  request 
)
static

Fill the current row of the request with invalid (null) data as row will be marked as rejected

Definition at line 175 of file TextFileBufferParser.cpp.

References foreign_storage::ParseBufferRequest::copy_params, foreign_storage::ParseBufferRequest::getCatalog(), foreign_storage::ParseBufferRequest::import_buffers, and processInvalidGeoColumn().

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), and foreign_storage::RegexFileBufferParser::parseBuffer().

179  {
180  size_t col_idx = starting_col_idx;
181 
182  for (; cd_it != columns.end(); cd_it++) {
183  auto cd = *cd_it;
184  const auto& col_ti = cd->columnType;
185  if (col_ti.is_geometry()) {
186  if (request.import_buffers[col_idx] != nullptr) {
187  processInvalidGeoColumn(request.import_buffers,
188  col_idx,
189  request.copy_params,
190  cd,
191  request.getCatalog());
192  } else {
193  ++col_idx;
194  col_idx += col_ti.get_physical_cols();
195  }
196  // skip remaining physical columns
197  for (int i = 0; i < cd->columnType.get_physical_cols(); ++i) {
198  ++cd_it;
199  }
200  } else {
201  if (request.import_buffers[col_idx] != nullptr) {
202  request.import_buffers[col_idx]->add_value(cd,
203  {},
204  true,
205  request.copy_params,
206  /*check_not_null=*/false);
207  }
208  ++col_idx;
209  }
210  }
211 }
static void processInvalidGeoColumn(std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, const ColumnDescriptor *cd, std::shared_ptr< Catalog_Namespace::Catalog > catalog)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual size_t foreign_storage::TextFileBufferParser::findRowEndPosition ( size_t &  alloc_size,
std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const import_export::CopyParams &  copy_params,
const size_t  buffer_first_row_index,
unsigned int &  num_rows_in_buffer,
FileReader *  file_reader 
) const
pure virtual

Finds and returns the offset of the end of the last row in the given buffer. If the buffer does not contain at least one row, the buffer is extended with more content from the file until a row is read. An exception is thrown if the buffer is extended to a maximum threshold and at least one row has still not been read.

Implemented in foreign_storage::RegexFileBufferParser, and foreign_storage::CsvFileBufferParser.

Referenced by foreign_storage::dispatch_scan_requests().

+ Here is the caller graph for this function:

bool foreign_storage::TextFileBufferParser::isCoordinateScalar ( const std::string_view  datum)
static

Definition at line 124 of file TextFileBufferParser.cpp.

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), processGeoColumn(), and foreign_storage::anonymous_namespace{TextFileBufferParser.cpp}::set_coordinates_from_separate_lon_lat_columns().

124  {
125  // field looks like a scalar numeric value (and not a hex blob)
126  return datum.size() > 0 && (datum[0] == '.' || isdigit(datum[0]) || datum[0] == '-') &&
127  datum.find_first_of("ABCDEFabcdef") == std::string_view::npos;
128 }

+ Here is the caller graph for this function:

bool foreign_storage::TextFileBufferParser::isNullDatum ( const std::string_view  datum,
const ColumnDescriptor *  column,
const std::string &  null_indicator 
)
static

Definition at line 364 of file TextFileBufferParser.cpp.

References ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_notnull(), is_null(), ImportHelpers::is_null_datum(), and SQLTypeInfo::is_string().

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), and foreign_storage::RegexFileBufferParser::parseBuffer().

366  {
367  bool is_null = ImportHelpers::is_null_datum(datum, null_indicator);
368 
369  // Treating empty as NULL
370  if (!column->columnType.is_string() && datum.empty()) {
371  is_null = true;
372  }
373 
374  if (is_null && column->columnType.get_notnull()) {
375  throw std::runtime_error("NULL value provided for column (" + column->columnName +
376  ") with NOT NULL constraint.");
377  }
378  return is_null;
379 }
bool is_null_datum(const DatumStringType &datum, const std::string &null_indicator)
CONSTEXPR DEVICE bool is_null(const T &value)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:576
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:387
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual ParseBufferResult foreign_storage::TextFileBufferParser::parseBuffer ( ParseBufferRequest &  request,
bool  convert_data_blocks,
bool  columns_are_pre_filtered = false,
bool  skip_dict_encoding = false 
) const
pure virtual

Parses a given file buffer and returns data blocks for each column in the file along with metadata related to rows and row offsets within the buffer.

Parameters
convert_data_blocks- convert import buffers to data blocks
columns_are_pre_filtered- file buffer passed into parse_buffer only has the necessary columns that are being requested, not all columns.
skip_dict_encoding- skip dictionary encoding for encoded strings; the encoding will be required to happen later in processing

Implemented in foreign_storage::RegexFileBufferParser, and foreign_storage::CsvFileBufferParser.

Referenced by foreign_storage::parse_file_regions(), foreign_storage::populate_chunks(), and foreign_storage::scan_metadata().

+ Here is the caller graph for this function:

void foreign_storage::TextFileBufferParser::processGeoColumn ( std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &  import_buffers,
size_t &  col_idx,
const import_export::CopyParams &  copy_params,
std::list< const ColumnDescriptor * >::iterator &  cd_it,
std::vector< std::string_view > &  row,
size_t &  import_idx,
bool  is_null,
size_t  first_row_index,
size_t  row_index_plus_one,
std::shared_ptr< Catalog_Namespace::Catalog >  catalog,
const RenderGroupAnalyzerMap *  render_group_analyzer_map 
)
static

Definition at line 250 of file TextFileBufferParser.cpp.

References CHECK, Geospatial::GeoTypesFactory::getGeoColumns(), Geospatial::GeoTypesFactory::getNullGeoColumns(), IS_GEO, IS_GEO_POLY, isCoordinateScalar(), kMULTIPOLYGON, kPOINT, kPOLYGON, import_export::CopyParams::lonlat, import_export::CopyParams::null_str, foreign_storage::anonymous_namespace{TextFileBufferParser.cpp}::PROMOTE_POLYGON_TO_MULTIPOLYGON, foreign_storage::anonymous_namespace{TextFileBufferParser.cpp}::set_coordinates_from_separate_lon_lat_columns(), import_export::Importer::set_geo_physical_import_buffer(), import_export::CopyParams::source_srid, and to_string().

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), and foreign_storage::RegexFileBufferParser::parseBuffer().

261  {
262  auto cd = *cd_it;
263  auto col_ti = cd->columnType;
264  SQLTypes col_type = col_ti.get_type();
265  CHECK(IS_GEO(col_type));
266 
267  auto starting_col_idx = col_idx;
268 
269  auto const& geo_string = row[import_idx];
270  ++import_idx;
271  ++col_idx;
272 
273  std::vector<double> coords;
274  std::vector<double> bounds;
275  std::vector<int> ring_sizes;
276  std::vector<int> poly_rings;
277  int render_group = 0;
278 
279  // prepare to transform from another SRID
280  SQLTypeInfo import_ti{col_ti};
281  if (import_ti.get_output_srid() == 4326) {
282  auto srid0 = copy_params.source_srid;
283  if (srid0 > 0) {
284  // srid0 -> 4326 transform is requested on import
285  import_ti.set_input_srid(srid0);
286  }
287  }
288 
289  if (!is_null && col_type == kPOINT && isCoordinateScalar(geo_string)) {
290  if (!set_coordinates_from_separate_lon_lat_columns(
291  geo_string, row[import_idx], import_ti, coords, copy_params.lonlat)) {
292  throw std::runtime_error("Cannot read lon/lat to insert into POINT column " +
293  cd->columnName);
294  }
295  ++import_idx;
296  } else {
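297  if (is_null || geo_string.empty() || geo_string == "NULL") {
298  Geospatial::GeoTypesFactory::getNullGeoColumns(import_ti,
299  coords,
300  bounds,
301  ring_sizes,
302  poly_rings,
303  PROMOTE_POLYGON_TO_MULTIPOLYGON);
304  is_null = true;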
305  } else {
306  // extract geometry directly from WKT
307  if (!Geospatial::GeoTypesFactory::getGeoColumns(std::string(geo_string),
308  import_ti,
309  coords,
310  bounds,
311  ring_sizes,
312  poly_rings,
313  PROMOTE_POLYGON_TO_MULTIPOLYGON)) {
314  std::string msg = "Failed to extract valid geometry from row " +
315  std::to_string(first_row_index + row_index_plus_one) +
316  " for column " + cd->columnName;
317  throw std::runtime_error(msg);
318  }
319 
320  // validate types
321  if (col_type != import_ti.get_type()) {
322  if (!PROMOTE_POLYGON_TO_MULTIPOLYGON ||
323  !(import_ti.get_type() == SQLTypes::kPOLYGON &&
324  col_type == SQLTypes::kMULTIPOLYGON)) {
325  throw std::runtime_error("Imported geometry doesn't match the type of column " +
326  cd->columnName);
327  }
328  }
329 
330  // get render group
331  if (IS_GEO_POLY(col_type) && render_group_analyzer_map &&
332  render_group_analyzer_map->size()) {
333  auto const itr = render_group_analyzer_map->find(cd->columnId);
334  if (itr != render_group_analyzer_map->end()) {
335  auto& render_group_analyzer = *itr->second;
336  render_group = render_group_analyzer.insertBoundsAndReturnRenderGroup(bounds);
337  }
338  }
339  }
340  }
341 
342  // allowed to be null?
343  if (is_null && col_ti.get_notnull()) {
344  throw std::runtime_error("NULL value provided for column (" + cd->columnName +
345  ") with NOT NULL constraint.");
346  }
347 
348  // import extracted geo
349  import_export::Importer::set_geo_physical_import_buffer(*catalog,
350  cd,
351  import_buffers,
352  col_idx,
353  coords,
354  bounds,
355  ring_sizes,
356  poly_rings,
357  render_group);
358 
359  // store null string in the base column
360  import_buffers[starting_col_idx]->add_value(
361  cd, copy_params.null_str, true, copy_params);
362 }
SQLTypes
Definition: sqltypes.h:53
static void getNullGeoColumns(SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool promote_poly_to_mpoly=false)
Definition: Types.cpp:1309
std::string to_string(char const *&&v)
bool set_coordinates_from_separate_lon_lat_columns(const std::string_view lon_str, const std::string_view lat_str, SQLTypeInfo &ti, std::vector< double > &coords, const bool is_lon_lat_order)
CONSTEXPR DEVICE bool is_null(const T &value)
static bool getGeoColumns(const std::string &wkt_or_wkb_hex, SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool promote_poly_to_mpoly=false)
Definition: Types.cpp:1079
#define CHECK(condition)
Definition: Logger.h:289
static void set_geo_physical_import_buffer(const Catalog_Namespace::Catalog &catalog, const ColumnDescriptor *cd, std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t &col_idx, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, int render_group, const bool force_null=false)
Definition: Importer.cpp:1627
static bool isCoordinateScalar(const std::string_view datum)
#define IS_GEO(T)
Definition: sqltypes.h:298
#define IS_GEO_POLY(T)
Definition: sqltypes.h:303

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::TextFileBufferParser::processInvalidGeoColumn ( std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &  import_buffers,
size_t &  col_idx,
const import_export::CopyParams &  copy_params,
const ColumnDescriptor *  cd,
std::shared_ptr< Catalog_Namespace::Catalog >  catalog 
)
static private

Definition at line 213 of file TextFileBufferParser.cpp.

References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::get_type(), Geospatial::GeoTypesFactory::getNullGeoColumns(), IS_GEO, import_export::CopyParams::null_str, foreign_storage::anonymous_namespace{TextFileBufferParser.cpp}::PROMOTE_POLYGON_TO_MULTIPOLYGON, and import_export::Importer::set_geo_physical_import_buffer().

Referenced by fillRejectedRowWithInvalidData().

218  {
219  auto col_ti = cd->columnType;
220  SQLTypes col_type = col_ti.get_type();
221  CHECK(IS_GEO(col_type));
222 
223  // store null string in the base column
224  import_buffers[col_idx]->add_value(cd, copy_params.null_str, true, copy_params);
225  ++col_idx;
226 
227  std::vector<double> coords;
228  std::vector<double> bounds;
229  std::vector<int> ring_sizes;
230  std::vector<int> poly_rings;
231  int render_group = 0;
232 
233  SQLTypeInfo import_ti{col_ti};
234  Geospatial::GeoTypesFactory::getNullGeoColumns(
235  import_ti, coords, bounds, ring_sizes, poly_rings, PROMOTE_POLYGON_TO_MULTIPOLYGON);
236 
237  // import extracted geo
238  import_export::Importer::set_geo_physical_import_buffer(*catalog,
239  cd,
240  import_buffers,
241  col_idx,
242  coords,
243  bounds,
244  ring_sizes,
245  poly_rings,
246  render_group,
247  /*force_null=*/true);
248 }
SQLTypes
Definition: sqltypes.h:53
static void getNullGeoColumns(SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool promote_poly_to_mpoly=false)
Definition: Types.cpp:1309
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:380
#define CHECK(condition)
Definition: Logger.h:289
static void set_geo_physical_import_buffer(const Catalog_Namespace::Catalog &catalog, const ColumnDescriptor *cd, std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t &col_idx, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, int render_group, const bool force_null=false)
Definition: Importer.cpp:1627
SQLTypeInfo columnType
#define IS_GEO(T)
Definition: sqltypes.h:298

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual import_export::CopyParams foreign_storage::TextFileBufferParser::validateAndGetCopyParams ( const ForeignTable *  foreign_table) const
pure virtual

Validates foreign table parse options and returns a CopyParams object upon successful validation. An exception is thrown if validation fails.

Implemented in foreign_storage::RegexFileBufferParser, and foreign_storage::CsvFileBufferParser.

Referenced by foreign_storage::AbstractTextFileDataWrapper::iterativeFileScan(), foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata(), foreign_storage::AbstractTextFileDataWrapper::populateChunks(), and foreign_storage::AbstractTextFileDataWrapper::restoreDataWrapperInternals().

+ Here is the caller graph for this function:

virtual void foreign_storage::TextFileBufferParser::validateFiles ( const FileReader *  file_reader,
const ForeignTable *  foreign_table 
) const
pure virtual

Performs basic validation of files to be parsed.

Implemented in foreign_storage::CsvFileBufferParser, and foreign_storage::RegexFileBufferParser.

Referenced by foreign_storage::anonymous_namespace{AbstractTextFileDataWrapper.cpp}::initialize_non_append_mode_scan().

+ Here is the caller graph for this function:

Member Data Documentation

const std::string foreign_storage::TextFileBufferParser::BUFFER_SIZE_KEY = "BUFFER_SIZE"
inline static

The documentation for this class was generated from the following files: