OmniSciDB  b24e664e58
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Importer_NS::DelimitedParserUtils Class Reference

#include <DelimitedParserUtils.h>

Static Public Member Functions

static size_t find_beginning (const char *buffer, size_t begin, size_t end, const CopyParams &copy_params)
 Finds the closest possible row beginning in the given buffer. More...
 
static size_t find_end (const char *buffer, size_t size, const CopyParams &copy_params, unsigned int &num_rows_this_buffer)
 Finds the closest possible row ending to the end of the given buffer. More...
 
static const char * get_row (const char *buf, const char *buf_end, const char *entire_buf_end, const Importer_NS::CopyParams &copy_params, const bool *is_array, std::vector< std::string > &row, bool &try_single_thread)
 Parses the first row in the given buffer and inserts fields into given vector. More...
 
static void parseStringArray (const std::string &s, const Importer_NS::CopyParams &copy_params, std::vector< std::string > &string_vec)
 Parses given string array and inserts into given vector of strings. More...
 

Detailed Description

Definition at line 31 of file DelimitedParserUtils.h.

Member Function Documentation

size_t Importer_NS::DelimitedParserUtils::find_beginning ( const char *  buffer,
size_t  begin,
size_t  end,
const CopyParams &  copy_params 
)
static

Finds the closest possible row beginning in the given buffer.

Parameters
buffer: Given buffer which has the rows in CSV format. (Not owned.)
begin: Start index in the buffer from which to look for the row beginning.
end: End index in the buffer up to which to look for the row beginning.
copy_params: Copy params for the table.
Returns
The position of the closest possible row beginning to the start of the given buffer.

Definition at line 58 of file DelimitedParserUtils.cpp.

References Importer_NS::CopyParams::line_delim.

Referenced by Importer_NS::import_thread_delimited().

61  {
62  // @TODO(wei) line_delim inside quotes is not supported
63  if (begin == 0 || (begin > 0 && buffer[begin - 1] == copy_params.line_delim)) {
64  return 0;
65  }
66  size_t i;
67  const char* buf = buffer + begin;
68  for (i = 0; i < end - begin; i++) {
69  if (buf[i] == copy_params.line_delim) {
70  return i + 1;
71  }
72  }
73  return i;
74 }

+ Here is the caller graph for this function:

size_t Importer_NS::DelimitedParserUtils::find_end ( const char *  buffer,
size_t  size,
const CopyParams &  copy_params,
unsigned int &  num_rows_this_buffer 
)
static

Finds the closest possible row ending to the end of the given buffer.

Parameters
buffer: Given buffer which has the rows in CSV format. (Not owned.)
size: Size of the buffer.
copy_params: Copy params for the table.
num_rows_this_buffer: Number of rows until the closest possible row ending.
Returns
The position of the closest possible row ending to the end of the given buffer.

Definition at line 76 of file DelimitedParserUtils.cpp.

References logger::ERROR, Importer_NS::CopyParams::escape, Importer_NS::CopyParams::line_delim, LOG, Importer_NS::CopyParams::quote, and Importer_NS::CopyParams::quoted.

Referenced by Importer_NS::Importer::importDelimited().

79  {
80  size_t last_line_delim_pos = 0;
81  if (copy_params.quoted) {
82  const char* current = buffer;
83  bool in_quote = false;
84 
85  while (current < buffer + size) {
86  while (!in_quote && current < buffer + size) {
87  // We are outside of quotes. We have to find the last possible line delimiter.
88  if (*current == copy_params.line_delim) {
89  last_line_delim_pos = current - buffer;
90  ++num_rows_this_buffer;
91  } else if (*current == copy_params.quote) {
92  in_quote = true;
93  }
94  ++current;
95  }
96 
97  while (in_quote && current < buffer + size) {
98  // We are in a quoted field. We have to find the ending quote.
99  if ((*current == copy_params.escape) && (current < buffer + size - 1) &&
100  (*(current + 1) == copy_params.quote)) {
101  ++current;
102  } else if (*current == copy_params.quote) {
103  in_quote = false;
104  }
105  ++current;
106  }
107  }
108  } else {
109  const char* current = buffer;
110  while (current < buffer + size) {
111  if (*current == copy_params.line_delim) {
112  last_line_delim_pos = current - buffer;
113  ++num_rows_this_buffer;
114  }
115  ++current;
116  }
117  }
118 
119  if (last_line_delim_pos <= 0) {
120  size_t slen = size < 50 ? size : 50;
121  std::string showMsgStr(buffer, buffer + slen);
122  LOG(ERROR) << "No line delimiter in block. Block was of size " << size
123  << " bytes, first few characters " << showMsgStr;
124  return size;
125  }
126 
127  return last_line_delim_pos + 1;
128 }
#define LOG(tag)
Definition: Logger.h:185

+ Here is the caller graph for this function:

const char * Importer_NS::DelimitedParserUtils::get_row ( const char *  buf,
const char *  buf_end,
const char *  entire_buf_end,
const Importer_NS::CopyParams &  copy_params,
const bool *  is_array,
std::vector< std::string > &  row,
bool &  try_single_thread 
)
static

Parses the first row in the given buffer and inserts fields into given vector.

Parameters
buf: Given buffer which has the rows in CSV format. (Not owned.)
buf_end: End of the sliced buffer for the thread. (Not owned.)
entire_buf_end: End of the entire buffer. (Not owned.)
copy_params: Copy params for the table.
is_array: Array of bools which tells whether each column is an array type.
row: Given vector to be populated with the parsed fields.
try_single_thread: In case of parse errors, tells whether parsing should continue with a single thread.
Returns
Pointer to the next row after the first row is parsed.

Definition at line 130 of file DelimitedParserUtils.cpp.

References Importer_NS::CopyParams::array_begin, Importer_NS::CopyParams::array_end, Importer_NS::CopyParams::delimiter, logger::ERROR, Importer_NS::CopyParams::escape, field(), anonymous_namespace{DelimitedParserUtils.cpp}::is_eol(), LOG, Importer_NS::CopyParams::quote, Importer_NS::CopyParams::quoted, anonymous_namespace{DelimitedParserUtils.cpp}::trim_quotes(), and Importer_NS::trim_space().

Referenced by Importer_NS::import_thread_delimited(), parseStringArray(), and Importer_NS::Detector::split_raw_data().

136  {
137  const char* field = buf;
138  const char* p;
139  bool in_quote = false;
140  bool in_array = false;
141  bool has_escape = false;
142  bool strip_quotes = false;
143  try_single_thread = false;
144  for (p = buf; p < entire_buf_end; ++p) {
145  if (*p == copy_params.escape && p < entire_buf_end - 1 &&
146  *(p + 1) == copy_params.quote) {
147  p++;
148  has_escape = true;
149  } else if (copy_params.quoted && *p == copy_params.quote) {
150  in_quote = !in_quote;
151  if (in_quote) {
152  strip_quotes = true;
153  }
154  } else if (!in_quote && is_array != nullptr && *p == copy_params.array_begin &&
155  is_array[row.size()]) {
156  in_array = true;
157  while (p < entire_buf_end - 1) { // Array type will be parsed separately.
158  ++p;
159  if (*p == copy_params.array_end) {
160  in_array = false;
161  break;
162  }
163  }
164  } else if (*p == copy_params.delimiter || is_eol(*p, copy_params)) {
165  if (!in_quote) {
166  if (!has_escape && !strip_quotes) {
167  const char* field_end = p;
168  trim_space(field, field_end);
169  row.emplace_back(field, field_end - field);
170  } else {
171  auto field_buf = std::make_unique<char[]>(p - field + 1);
172  int j = 0, i = 0;
173  for (; i < p - field; i++, j++) {
174  if (has_escape && field[i] == copy_params.escape &&
175  field[i + 1] == copy_params.quote) {
176  field_buf[j] = copy_params.quote;
177  i++;
178  } else {
179  field_buf[j] = field[i];
180  }
181  }
182  const char* field_begin = field_buf.get();
183  const char* field_end = field_buf.get() + j;
184  trim_space(field_begin, field_end);
185  trim_quotes(field_begin, field_end, copy_params);
186  row.emplace_back(field_begin, field_end - field_begin);
187  }
188  field = p + 1;
189  has_escape = false;
190  strip_quotes = false;
191 
192  if (is_eol(*p, copy_params)) {
193  // We are at the end of the row. Skip the line endings now.
194  while (p + 1 < buf_end && is_eol(*(p + 1), copy_params)) {
195  p++;
196  }
197  break;
198  }
199  }
200  }
201  }
202  /*
203  @TODO(wei) do error handling
204  */
205  if (in_quote) {
206  LOG(ERROR) << "Unmatched quote.";
207  try_single_thread = true;
208  }
209  if (in_array) {
210  LOG(ERROR) << "Unmatched array.";
211  try_single_thread = true;
212  }
213  return p;
214 }
void trim_quotes(const char *&field_begin, const char *&field_end, const Importer_NS::CopyParams &copy_params)
#define LOG(tag)
Definition: Logger.h:185
static const std::string trim_space(const char *field, const size_t len)
Definition: Importer.cpp:220
const rapidjson::Value & field(const rapidjson::Value &obj, const char field[]) noexcept
Definition: JsonAccessors.h:31
bool is_eol(const char &c, const Importer_NS::CopyParams &copy_params)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void Importer_NS::DelimitedParserUtils::parseStringArray ( const std::string &  s,
const Importer_NS::CopyParams &  copy_params,
std::vector< std::string > &  string_vec 
)
static

Parses given string array and inserts into given vector of strings.

Parameters
s: Given string array.
copy_params: Copy params for the table.
string_vec: Given vector to be populated with the parsed fields.

Definition at line 216 of file DelimitedParserUtils.cpp.

References Importer_NS::CopyParams::array_begin, Importer_NS::CopyParams::array_delim, Importer_NS::CopyParams::array_end, Importer_NS::CopyParams::delimiter, get_row(), StringDictionary::MAX_STRLEN, Importer_NS::CopyParams::null_str, and to_string().

Referenced by Importer_NS::TypedImportBuffer::add_value(), and RowToColumnLoader::convert_string_to_column().

218  {
219  if (s == copy_params.null_str || s == "NULL" || s.size() < 1 || s.empty()) {
220  // TODO: should not convert NULL, empty arrays to {"NULL"},
221  // need to support NULL, empty properly
222  string_vec.emplace_back("NULL");
223  return;
224  }
225  if (s[0] != copy_params.array_begin || s[s.size() - 1] != copy_params.array_end) {
226  throw std::runtime_error("Malformed Array :" + s);
227  }
228 
229  std::string row(s.c_str() + 1, s.length() - 2);
230  row.push_back('\n');
231  bool try_single_thread = false;
232  Importer_NS::CopyParams array_params = copy_params;
233  array_params.delimiter = copy_params.array_delim;
234  get_row(row.c_str(),
235  row.c_str() + row.length(),
236  row.c_str() + row.length(),
237  array_params,
238  nullptr,
239  string_vec,
240  try_single_thread);
241 
242  for (size_t i = 0; i < string_vec.size(); ++i) {
243  if (string_vec[i].empty()) { // Disallow empty strings for now
244  string_vec.erase(string_vec.begin() + i);
245  --i;
246  } else if (string_vec[i].size() > StringDictionary::MAX_STRLEN) {
247  throw std::runtime_error("Array String too long : " + string_vec[i] + " max is " +
248  std::to_string(StringDictionary::MAX_STRLEN));
249  }
250  }
251 }
std::string null_str
Definition: CopyParams.h:47
static const char * get_row(const char *buf, const char *buf_end, const char *entire_buf_end, const Importer_NS::CopyParams &copy_params, const bool *is_array, std::vector< std::string > &row, bool &try_single_thread)
Parses the first row in the given buffer and inserts fields into given vector.
std::string to_string(char const *&&v)
static constexpr size_t MAX_STRLEN

+ Here is the call graph for this function:

+ Here is the caller graph for this function:


The documentation for this class was generated from the following files: