OmniSciDB  471d68cefb
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::TextFileBufferParser Class Reference (abstract)

#include <TextFileBufferParser.h>

+ Inheritance diagram for foreign_storage::TextFileBufferParser:

Public Member Functions

virtual ParseBufferResult parseBuffer (ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered=false) const =0
 
virtual import_export::CopyParams validateAndGetCopyParams (const ForeignTable *foreign_table) const =0
 
virtual size_t findRowEndPosition (size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const import_export::CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FileReader *file_reader) const =0
 
virtual void validateFiles (const FileReader *file_reader, const ForeignTable *foreign_table) const =0
 

Static Public Member Functions

static std::map< int, DataBlockPtr > convertImportBuffersToDataBlocks (const std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers)
 
static bool isCoordinateScalar (const std::string_view datum)
 
static void processGeoColumn (std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, std::list< const ColumnDescriptor * >::iterator &cd_it, std::vector< std::string_view > &row, size_t &import_idx, bool is_null, size_t first_row_index, size_t row_index_plus_one, std::shared_ptr< Catalog_Namespace::Catalog > catalog)
 
static bool isNullDatum (const std::string_view datum, const ColumnDescriptor *column, const std::string &null_indicator)
 

Detailed Description

Definition at line 84 of file TextFileBufferParser.h.

Member Function Documentation

std::map< int, DataBlockPtr > foreign_storage::TextFileBufferParser::convertImportBuffersToDataBlocks ( const std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &  import_buffers)
static

Definition at line 56 of file TextFileBufferParser.cpp.

References DataBlockPtr::arraysPtr, threading_serial::async(), CHECK, CHECK_EQ, IS_STRING, kARRAY, kBOOLEAN, kENCODING_DICT, kENCODING_NONE, DataBlockPtr::numbersPtr, run_benchmark_import::result, and DataBlockPtr::stringsPtr.

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), foreign_storage::RegexFileBufferParser::parseBuffer(), and foreign_storage::InternalSystemDataWrapper::populateChunkBuffers().

58  {
59  std::map<int, DataBlockPtr> result;
60  std::vector<std::pair<const size_t, std::future<int8_t*>>>
61  encoded_data_block_ptrs_futures;
62  // make all async calls to string dictionary here and then continue execution
63  for (const auto& import_buffer : import_buffers) {
64  if (import_buffer == nullptr) {
65  continue;
66  }
67  DataBlockPtr p;
68  if (import_buffer->getTypeInfo().is_number() ||
69  import_buffer->getTypeInfo().is_time() ||
70  import_buffer->getTypeInfo().get_type() == kBOOLEAN) {
71  p.numbersPtr = import_buffer->getAsBytes();
72  } else if (import_buffer->getTypeInfo().is_string()) {
73  auto string_payload_ptr = import_buffer->getStringBuffer();
74  if (import_buffer->getTypeInfo().get_compression() == kENCODING_NONE) {
75  p.stringsPtr = string_payload_ptr;
76  } else {
77  CHECK_EQ(kENCODING_DICT, import_buffer->getTypeInfo().get_compression());
78  p.numbersPtr = nullptr;
79 
80  auto column_id = import_buffer->getColumnDesc()->columnId;
81  encoded_data_block_ptrs_futures.emplace_back(std::make_pair(
82  column_id,
83  std::async(std::launch::async, [&import_buffer, string_payload_ptr] {
84  import_buffer->addDictEncodedString(*string_payload_ptr);
85  return import_buffer->getStringDictBuffer();
86  })));
87  }
88  } else if (import_buffer->getTypeInfo().is_geometry()) {
89  auto geo_payload_ptr = import_buffer->getGeoStringBuffer();
90  p.stringsPtr = geo_payload_ptr;
91  } else {
92  CHECK(import_buffer->getTypeInfo().get_type() == kARRAY);
93  if (IS_STRING(import_buffer->getTypeInfo().get_subtype())) {
94  CHECK(import_buffer->getTypeInfo().get_compression() == kENCODING_DICT);
95  import_buffer->addDictEncodedStringArray(*import_buffer->getStringArrayBuffer());
96  p.arraysPtr = import_buffer->getStringArrayDictBuffer();
97  } else {
98  p.arraysPtr = import_buffer->getArrayBuffer();
99  }
100  }
101  result[import_buffer->getColumnDesc()->columnId] = p;
102  }
103 
104  // wait for the async requests we made for string dictionary
105  for (auto& encoded_ptr_future : encoded_data_block_ptrs_futures) {
106  result[encoded_ptr_future.first].numbersPtr = encoded_ptr_future.second.get();
107  }
108  return result;
109 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:227
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:228
future< Result > async(Fn &&fn, Args &&...args)
#define IS_STRING(T)
Definition: sqltypes.h:250
#define CHECK(condition)
Definition: Logger.h:209
int8_t * numbersPtr
Definition: sqltypes.h:226

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual size_t foreign_storage::TextFileBufferParser::findRowEndPosition ( size_t &  alloc_size,
std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const import_export::CopyParams &  copy_params,
const size_t  buffer_first_row_index,
unsigned int &  num_rows_in_buffer,
FileReader *  file_reader 
) const
pure virtual

Finds and returns the offset of the end of the last row in the given buffer. If the buffer does not contain at least one row, the buffer is extended with more content from the file until a row is read. An exception is thrown if the buffer is extended to a maximum threshold and at least one row has still not been read.

Implemented in foreign_storage::RegexFileBufferParser, and foreign_storage::CsvFileBufferParser.

Referenced by foreign_storage::dispatch_metadata_scan_requests().

+ Here is the caller graph for this function:

bool foreign_storage::TextFileBufferParser::isCoordinateScalar ( const std::string_view  datum)
static

Definition at line 111 of file TextFileBufferParser.cpp.

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), processGeoColumn(), and foreign_storage::anonymous_namespace{TextFileBufferParser.cpp}::set_coordinates_from_separate_lon_lat_columns().

111  {
112  // field looks like a scalar numeric value (and not a hex blob)
113  return datum.size() > 0 && (datum[0] == '.' || isdigit(datum[0]) || datum[0] == '-') &&
114  datum.find_first_of("ABCDEFabcdef") == std::string_view::npos;
115 }

+ Here is the caller graph for this function:

bool foreign_storage::TextFileBufferParser::isNullDatum ( const std::string_view  datum,
const ColumnDescriptor *  column,
const std::string &  null_indicator 
)
static

Definition at line 235 of file TextFileBufferParser.cpp.

References ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_notnull(), is_null(), and SQLTypeInfo::is_string().

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), and foreign_storage::RegexFileBufferParser::parseBuffer().

237  {
238  bool is_null = (datum == null_indicator);
239 
240  // Treating empty as NULL
241  if (!column->columnType.is_string() && datum.empty()) {
242  is_null = true;
243  }
244 
245  if (is_null && column->columnType.get_notnull()) {
246  throw std::runtime_error("NULL value provided for column (" + column->columnName +
247  ") with NOT NULL constraint.");
248  }
249  return is_null;
250 }
CONSTEXPR DEVICE bool is_null(const T &value)
SQLTypeInfo columnType
bool is_string() const
Definition: sqltypes.h:509
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:336
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual ParseBufferResult foreign_storage::TextFileBufferParser::parseBuffer ( ParseBufferRequest &  request,
bool  convert_data_blocks,
bool  columns_are_pre_filtered = false 
) const
pure virtual

Parses a given file buffer and returns data blocks for each column in the file along with metadata related to rows and row offsets within the buffer.

Parameters
convert_data_blocks - convert import buffers to data blocks
columns_are_pre_filtered - the file buffer passed into parseBuffer contains only the columns that are being requested, not all columns.

Implemented in foreign_storage::RegexFileBufferParser, and foreign_storage::CsvFileBufferParser.

Referenced by foreign_storage::parse_file_regions(), and foreign_storage::scan_metadata().

+ Here is the caller graph for this function:

void foreign_storage::TextFileBufferParser::processGeoColumn ( std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &  import_buffers,
size_t &  col_idx,
const import_export::CopyParams &  copy_params,
std::list< const ColumnDescriptor * >::iterator &  cd_it,
std::vector< std::string_view > &  row,
size_t &  import_idx,
bool  is_null,
size_t  first_row_index,
size_t  row_index_plus_one,
std::shared_ptr< Catalog_Namespace::Catalog >  catalog 
)
static

Definition at line 151 of file TextFileBufferParser.cpp.

References CHECK, Geospatial::GeoTypesFactory::getGeoColumns(), Geospatial::GeoTypesFactory::getNullGeoColumns(), IS_GEO, isCoordinateScalar(), kMULTIPOLYGON, kPOINT, kPOLYGON, import_export::CopyParams::lonlat, import_export::CopyParams::null_str, foreign_storage::anonymous_namespace{TextFileBufferParser.cpp}::PROMOTE_POLYGON_TO_MULTIPOLYGON, foreign_storage::anonymous_namespace{TextFileBufferParser.cpp}::set_coordinates_from_separate_lon_lat_columns(), import_export::Importer::set_geo_physical_import_buffer(), and to_string().

Referenced by foreign_storage::CsvFileBufferParser::parseBuffer(), and foreign_storage::RegexFileBufferParser::parseBuffer().

161  {
162  auto cd = *cd_it;
163  auto col_ti = cd->columnType;
164  SQLTypes col_type = col_ti.get_type();
165  CHECK(IS_GEO(col_type));
166 
167  // store null string in the base column
168  import_buffers[col_idx]->add_value(cd, copy_params.null_str, true, copy_params);
169 
170  auto const& geo_string = row[import_idx];
171  ++import_idx;
172  ++col_idx;
173 
174  std::vector<double> coords;
175  std::vector<double> bounds;
176  std::vector<int> ring_sizes;
177  std::vector<int> poly_rings;
178  int render_group = 0;
179 
180  if (!is_null && col_type == kPOINT && isCoordinateScalar(geo_string)) {
181  if (!set_coordinates_from_separate_lon_lat_columns(
182  geo_string, row[import_idx], coords, copy_params.lonlat)) {
183  throw std::runtime_error("Cannot read lon/lat to insert into POINT column " +
184  cd->columnName);
185  }
186  ++import_idx;
187  } else {
188  SQLTypeInfo import_ti{col_ti};
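189  if (is_null) {
190  Geospatial::GeoTypesFactory::getNullGeoColumns(import_ti,
191  coords,
192  bounds,
193  ring_sizes,
194  poly_rings,
195  PROMOTE_POLYGON_TO_MULTIPOLYGON);
196  } else {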
197  // extract geometry directly from WKT
198  if (!Geospatial::GeoTypesFactory::getGeoColumns(std::string(geo_string),
199  import_ti,
200  coords,
201  bounds,
202  ring_sizes,
203  poly_rings,
204  PROMOTE_POLYGON_TO_MULTIPOLYGON)) {
205  std::string msg = "Failed to extract valid geometry from row " +
206  std::to_string(first_row_index + row_index_plus_one) +
207  " for column " + cd->columnName;
208  throw std::runtime_error(msg);
209  }
210 
211  // validate types
212  if (col_type != import_ti.get_type()) {
213  if (!PROMOTE_POLYGON_TO_MULTIPOLYGON ||
214  !(import_ti.get_type() == SQLTypes::kPOLYGON &&
215  col_type == SQLTypes::kMULTIPOLYGON)) {
216  throw std::runtime_error("Imported geometry doesn't match the type of column " +
217  cd->columnName);
218  }
219  }
220  }
221  }
222 
223  // import extracted geo
224  import_export::Importer::set_geo_physical_import_buffer(*catalog,
225  cd,
226  import_buffers,
227  col_idx,
228  coords,
229  bounds,
230  ring_sizes,
231  poly_rings,
232  render_group);
233 }
SQLTypes
Definition: sqltypes.h:38
static void getNullGeoColumns(SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool promote_poly_to_mpoly=false)
Definition: Types.cpp:1144
std::string to_string(char const *&&v)
CONSTEXPR DEVICE bool is_null(const T &value)
static void set_geo_physical_import_buffer(const Catalog_Namespace::Catalog &catalog, const ColumnDescriptor *cd, std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t &col_idx, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, int render_group)
Definition: Importer.cpp:1630
static bool getGeoColumns(const std::string &wkt_or_wkb_hex, SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool promote_poly_to_mpoly=false)
Definition: Types.cpp:937
#define CHECK(condition)
Definition: Logger.h:209
bool set_coordinates_from_separate_lon_lat_columns(const std::string_view lon_str, const std::string_view lat_str, std::vector< double > &coords, const bool is_lon_lat_order)
static bool isCoordinateScalar(const std::string_view datum)
#define IS_GEO(T)
Definition: sqltypes.h:251

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

virtual import_export::CopyParams foreign_storage::TextFileBufferParser::validateAndGetCopyParams ( const ForeignTable *  foreign_table) const
pure virtual

Validates foreign table parse options and returns a CopyParams object upon successful validation. An exception is thrown if validation fails.

Implemented in foreign_storage::RegexFileBufferParser, and foreign_storage::CsvFileBufferParser.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata(), foreign_storage::AbstractTextFileDataWrapper::populateChunks(), and foreign_storage::AbstractTextFileDataWrapper::restoreDataWrapperInternals().

+ Here is the caller graph for this function:

virtual void foreign_storage::TextFileBufferParser::validateFiles ( const FileReader *  file_reader,
const ForeignTable *  foreign_table 
) const
pure virtual

Performs basic validation of files to be parsed.

Implemented in foreign_storage::RegexFileBufferParser, and foreign_storage::CsvFileBufferParser.


The documentation for this class was generated from the following files: