OmniSciDB  72180abbfe
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
DelimitedParserUtils.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * @file DelimitedParserUtils.cpp
19  * @author Mehmet Sariyuce <mehmet.sariyuce@omnisci.com>
20  * @brief Implementation of delimited parser utils.
21  */
22 
24 
25 #include <string_view>
26 
27 #include "Shared/Logger.h"
29 
30 namespace {
31 inline bool is_eol(const char& c, const import_export::CopyParams& copy_params) {
32  return c == copy_params.line_delim || c == '\n' || c == '\r';
33 }
34 
/**
 * Shrinks the range [field_begin, field_end) in place so that it neither
 * starts nor ends with a space (' ') or carriage return ('\r').
 *
 * Only the two pointers are adjusted; the characters themselves are not
 * modified. Other whitespace (tabs, line feeds) is left alone.
 *
 * @param field_begin  in/out pointer to the first character of the field
 * @param field_end    in/out pointer one past the last character of the field
 */
inline void trim_space(const char*& field_begin, const char*& field_end) {
  const auto is_padding = [](const char ch) { return ch == ' ' || ch == '\r'; };

  // Drop leading padding.
  for (; field_begin < field_end && is_padding(*field_begin); ++field_begin) {
  }
  // Drop trailing padding.
  for (; field_begin < field_end && is_padding(field_end[-1]); --field_end) {
  }
}
44 
45 inline void trim_quotes(const char*& field_begin,
46  const char*& field_end,
47  const import_export::CopyParams& copy_params) {
48  if (copy_params.quoted && field_end - field_begin > 0 &&
49  *field_begin == copy_params.quote) {
50  ++field_begin;
51  }
52  if (copy_params.quoted && field_end - field_begin > 0 &&
53  *(field_end - 1) == copy_params.quote) {
54  --field_end;
55  }
56 }
57 } // namespace
58 
59 namespace import_export {
60 namespace delimited_parser {
61 size_t find_beginning(const char* buffer,
62  size_t begin,
63  size_t end,
64  const import_export::CopyParams& copy_params) {
65  // @TODO(wei) line_delim is in quotes note supported
66  if (begin == 0 || (begin > 0 && buffer[begin - 1] == copy_params.line_delim)) {
67  return 0;
68  }
69  size_t i;
70  const char* buf = buffer + begin;
71  for (i = 0; i < end - begin; i++) {
72  if (buf[i] == copy_params.line_delim) {
73  return i + 1;
74  }
75  }
76  return i;
77 }
78 
79 size_t find_end(const char* buffer,
80  size_t size,
81  const import_export::CopyParams& copy_params,
82  unsigned int& num_rows_this_buffer) {
83  size_t last_line_delim_pos = 0;
84  if (copy_params.quoted) {
85  const char* current = buffer;
86  bool in_quote = false;
87 
88  while (current < buffer + size) {
89  while (!in_quote && current < buffer + size) {
90  // We are outside of quotes. We have to find the last possible line delimiter.
91  if (*current == copy_params.line_delim) {
92  last_line_delim_pos = current - buffer;
93  ++num_rows_this_buffer;
94  } else if (*current == copy_params.quote) {
95  in_quote = true;
96  }
97  ++current;
98  }
99 
100  while (in_quote && current < buffer + size) {
101  // We are in a quoted field. We have to find the ending quote.
102  if ((*current == copy_params.escape) && (current < buffer + size - 1) &&
103  (*(current + 1) == copy_params.quote)) {
104  ++current;
105  } else if (*current == copy_params.quote) {
106  in_quote = false;
107  }
108  ++current;
109  }
110  }
111  } else {
112  const char* current = buffer;
113  while (current < buffer + size) {
114  if (*current == copy_params.line_delim) {
115  last_line_delim_pos = current - buffer;
116  ++num_rows_this_buffer;
117  }
118  ++current;
119  }
120  }
121 
122  if (last_line_delim_pos <= 0) {
123  size_t slen = size < 50 ? size : 50;
124  std::string showMsgStr(buffer, buffer + slen);
125  LOG(ERROR) << "No line delimiter in block. Block was of size " << size
126  << " bytes, first few characters " << showMsgStr;
127  return size;
128  }
129 
130  return last_line_delim_pos + 1;
131 }
132 
133 template <typename T>
134 const char* get_row(const char* buf,
135  const char* buf_end,
136  const char* entire_buf_end,
137  const import_export::CopyParams& copy_params,
138  const bool* is_array,
139  std::vector<T>& row,
140  std::vector<std::unique_ptr<char[]>>& tmp_buffers,
141  bool& try_single_thread) {
142  const char* field = buf;
143  const char* p;
144  bool in_quote = false;
145  bool in_array = false;
146  bool has_escape = false;
147  bool strip_quotes = false;
148  try_single_thread = false;
149  for (p = buf; p < entire_buf_end; ++p) {
150  if (*p == copy_params.escape && p < entire_buf_end - 1 &&
151  *(p + 1) == copy_params.quote) {
152  p++;
153  has_escape = true;
154  } else if (copy_params.quoted && *p == copy_params.quote) {
155  in_quote = !in_quote;
156  if (in_quote) {
157  strip_quotes = true;
158  }
159  } else if (!in_quote && is_array != nullptr && *p == copy_params.array_begin &&
160  is_array[row.size()]) {
161  in_array = true;
162  while (p < entire_buf_end - 1) { // Array type will be parsed separately.
163  ++p;
164  if (*p == copy_params.array_end) {
165  in_array = false;
166  break;
167  }
168  }
169  } else if (*p == copy_params.delimiter || is_eol(*p, copy_params)) {
170  if (!in_quote) {
171  if (!has_escape && !strip_quotes) {
172  const char* field_end = p;
173  trim_space(field, field_end);
174  row.emplace_back(field, field_end - field);
175  } else {
176  tmp_buffers.emplace_back(std::make_unique<char[]>(p - field + 1));
177  auto field_buf = tmp_buffers.back().get();
178  int j = 0, i = 0;
179  for (; i < p - field; i++, j++) {
180  if (has_escape && field[i] == copy_params.escape &&
181  field[i + 1] == copy_params.quote) {
182  field_buf[j] = copy_params.quote;
183  i++;
184  } else {
185  field_buf[j] = field[i];
186  }
187  }
188  const char* field_begin = field_buf;
189  const char* field_end = field_buf + j;
190  trim_space(field_begin, field_end);
191  trim_quotes(field_begin, field_end, copy_params);
192  row.emplace_back(field_begin, field_end - field_begin);
193  }
194  field = p + 1;
195  has_escape = false;
196  strip_quotes = false;
197 
198  if (is_eol(*p, copy_params)) {
199  // We are at the end of the row. Skip the line endings now.
200  while (p + 1 < buf_end && is_eol(*(p + 1), copy_params)) {
201  p++;
202  }
203  break;
204  }
205  }
206  }
207  }
208  /*
209  @TODO(wei) do error handling
210  */
211  if (in_quote) {
212  LOG(ERROR) << "Unmatched quote.";
213  try_single_thread = true;
214  }
215  if (in_array) {
216  LOG(ERROR) << "Unmatched array.";
217  try_single_thread = true;
218  }
219  return p;
220 }
221 
222 template const char* get_row(const char* buf,
223  const char* buf_end,
224  const char* entire_buf_end,
225  const import_export::CopyParams& copy_params,
226  const bool* is_array,
227  std::vector<std::string>& row,
228  std::vector<std::unique_ptr<char[]>>& tmp_buffers,
229  bool& try_single_thread);
230 
231 template const char* get_row(const char* buf,
232  const char* buf_end,
233  const char* entire_buf_end,
234  const import_export::CopyParams& copy_params,
235  const bool* is_array,
236  std::vector<std::string_view>& row,
237  std::vector<std::unique_ptr<char[]>>& tmp_buffers,
238  bool& try_single_thread);
239 
240 void parse_string_array(const std::string& s,
241  const import_export::CopyParams& copy_params,
242  std::vector<std::string>& string_vec) {
243  if (s == copy_params.null_str || s == "NULL" || s.size() < 1 || s.empty()) {
244  // TODO: should not convert NULL, empty arrays to {"NULL"},
245  // need to support NULL, empty properly
246  string_vec.emplace_back("NULL");
247  return;
248  }
249  if (s[0] != copy_params.array_begin || s[s.size() - 1] != copy_params.array_end) {
250  throw std::runtime_error("Malformed Array :" + s);
251  }
252 
253  std::string row(s.c_str() + 1, s.length() - 2);
254  row.push_back('\n');
255  bool try_single_thread = false;
256  import_export::CopyParams array_params = copy_params;
257  array_params.delimiter = copy_params.array_delim;
258  std::vector<std::unique_ptr<char[]>> tmp_buffers;
259  get_row(row.c_str(),
260  row.c_str() + row.length(),
261  row.c_str() + row.length(),
262  array_params,
263  nullptr,
264  string_vec,
265  tmp_buffers,
266  try_single_thread);
267 
268  for (size_t i = 0; i < string_vec.size(); ++i) {
269  if (string_vec[i].empty()) { // Disallow empty strings for now
270  string_vec.erase(string_vec.begin() + i);
271  --i;
272  } else if (string_vec[i].size() > StringDictionary::MAX_STRLEN) {
273  throw std::runtime_error("Array String too long : " + string_vec[i] + " max is " +
275  }
276  }
277 }
278 
279 } // namespace delimited_parser
280 } // namespace import_export
bool is_eol(const char &c, const import_export::CopyParams &copy_params)
void trim_quotes(const char *&field_begin, const char *&field_end, const import_export::CopyParams &copy_params)
#define LOG(tag)
Definition: Logger.h:188
size_t find_beginning(const char *buffer, size_t begin, size_t end, const import_export::CopyParams &copy_params)
Finds the closest possible row beginning in the given buffer.
std::string to_string(char const *&&v)
const rapidjson::Value & field(const rapidjson::Value &obj, const char field[]) noexcept
Definition: JsonAccessors.h:31
size_t find_end(const char *buffer, size_t size, const import_export::CopyParams &copy_params, unsigned int &num_rows_this_buffer)
Finds the closest possible row ending to the end of the given buffer.
void trim_space(const char *&field_begin, const char *&field_end)
static constexpr size_t MAX_STRLEN
static const std::string trim_space(const char *field, const size_t len)
Definition: Importer.cpp:221
const char * get_row(const char *buf, const char *buf_end, const char *entire_buf_end, const import_export::CopyParams &copy_params, const bool *is_array, std::vector< T > &row, std::vector< std::unique_ptr< char[]>> &tmp_buffers, bool &try_single_thread)
Parses the first row in the given buffer and inserts fields into given vector.
void parse_string_array(const std::string &s, const import_export::CopyParams &copy_params, std::vector< std::string > &string_vec)
Parses given string array and inserts into given vector of strings.