OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
DelimitedParserUtils.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * @file DelimitedParserUtils.cpp
19  * @author Mehmet Sariyuce <mehmet.sariyuce@omnisci.com>
20  * @brief Implementation of DelimitedParserUtils class.
21  */
22 
24 
25 #include "Shared/Logger.h"
27 
28 namespace {
29 inline bool is_eol(const char& c, const Importer_NS::CopyParams& copy_params) {
30  return c == copy_params.line_delim || c == '\n' || c == '\r';
31 }
32 
// Shrinks the half-open range [field_begin, field_end) in place so that it neither
// starts nor ends with a space or a carriage return. Only ' ' and '\r' are removed;
// any other character (tabs included) stops the trimming.
inline void trim_space(const char*& field_begin, const char*& field_end) {
  const auto is_trimmable = [](const char ch) { return ch == ' ' || ch == '\r'; };
  // Advance the front past leading blanks.
  for (; field_begin < field_end && is_trimmable(*field_begin); ++field_begin) {
  }
  // Pull the back in over trailing blanks; never crosses the (already moved) front.
  for (; field_begin < field_end && is_trimmable(field_end[-1]); --field_end) {
  }
}
42 
43 inline void trim_quotes(const char*& field_begin,
44  const char*& field_end,
45  const Importer_NS::CopyParams& copy_params) {
46  if (copy_params.quoted && field_end - field_begin > 0 &&
47  *field_begin == copy_params.quote) {
48  ++field_begin;
49  }
50  if (copy_params.quoted && field_end - field_begin > 0 &&
51  *(field_end - 1) == copy_params.quote) {
52  --field_end;
53  }
54 }
55 } // namespace
56 
57 namespace Importer_NS {
58 size_t DelimitedParserUtils::find_beginning(const char* buffer,
59  size_t begin,
60  size_t end,
61  const Importer_NS::CopyParams& copy_params) {
62  // @TODO(wei) line_delim is in quotes note supported
63  if (begin == 0 || (begin > 0 && buffer[begin - 1] == copy_params.line_delim)) {
64  return 0;
65  }
66  size_t i;
67  const char* buf = buffer + begin;
68  for (i = 0; i < end - begin; i++) {
69  if (buf[i] == copy_params.line_delim) {
70  return i + 1;
71  }
72  }
73  return i;
74 }
75 
76 size_t DelimitedParserUtils::find_end(const char* buffer,
77  size_t size,
78  const Importer_NS::CopyParams& copy_params,
79  unsigned int& num_rows_this_buffer) {
80  size_t last_line_delim_pos = 0;
81  if (copy_params.quoted) {
82  const char* current = buffer;
83  bool in_quote = false;
84 
85  while (current < buffer + size) {
86  while (!in_quote && current < buffer + size) {
87  // We are outside of quotes. We have to find the last possible line delimiter.
88  if (*current == copy_params.line_delim) {
89  last_line_delim_pos = current - buffer;
90  ++num_rows_this_buffer;
91  } else if (*current == copy_params.quote) {
92  in_quote = true;
93  }
94  ++current;
95  }
96 
97  while (in_quote && current < buffer + size) {
98  // We are in a quoted field. We have to find the ending quote.
99  if ((*current == copy_params.escape) && (current < buffer + size - 1) &&
100  (*(current + 1) == copy_params.quote)) {
101  ++current;
102  } else if (*current == copy_params.quote) {
103  in_quote = false;
104  }
105  ++current;
106  }
107  }
108  } else {
109  const char* current = buffer;
110  while (current < buffer + size) {
111  if (*current == copy_params.line_delim) {
112  last_line_delim_pos = current - buffer;
113  ++num_rows_this_buffer;
114  }
115  ++current;
116  }
117  }
118 
119  if (last_line_delim_pos <= 0) {
120  size_t slen = size < 50 ? size : 50;
121  std::string showMsgStr(buffer, buffer + slen);
122  LOG(ERROR) << "No line delimiter in block. Block was of size " << size
123  << " bytes, first few characters " << showMsgStr;
124  return size;
125  }
126 
127  return last_line_delim_pos + 1;
128 }
129 
130 const char* DelimitedParserUtils::get_row(const char* buf,
131  const char* buf_end,
132  const char* entire_buf_end,
133  const Importer_NS::CopyParams& copy_params,
134  const bool* is_array,
135  std::vector<std::string>& row,
136  bool& try_single_thread) {
137  const char* field = buf;
138  const char* p;
139  bool in_quote = false;
140  bool in_array = false;
141  bool has_escape = false;
142  bool strip_quotes = false;
143  try_single_thread = false;
144  for (p = buf; p < entire_buf_end; ++p) {
145  if (*p == copy_params.escape && p < entire_buf_end - 1 &&
146  *(p + 1) == copy_params.quote) {
147  p++;
148  has_escape = true;
149  } else if (copy_params.quoted && *p == copy_params.quote) {
150  in_quote = !in_quote;
151  if (in_quote) {
152  strip_quotes = true;
153  }
154  } else if (!in_quote && is_array != nullptr && *p == copy_params.array_begin &&
155  is_array[row.size()]) {
156  in_array = true;
157  while (p < entire_buf_end - 1) { // Array type will be parsed separately.
158  ++p;
159  if (*p == copy_params.array_end) {
160  in_array = false;
161  break;
162  }
163  }
164  } else if (*p == copy_params.delimiter || is_eol(*p, copy_params)) {
165  if (!in_quote) {
166  if (!has_escape && !strip_quotes) {
167  const char* field_end = p;
168  trim_space(field, field_end);
169  row.emplace_back(field, field_end - field);
170  } else {
171  auto field_buf = std::make_unique<char[]>(p - field + 1);
172  int j = 0, i = 0;
173  for (; i < p - field; i++, j++) {
174  if (has_escape && field[i] == copy_params.escape &&
175  field[i + 1] == copy_params.quote) {
176  field_buf[j] = copy_params.quote;
177  i++;
178  } else {
179  field_buf[j] = field[i];
180  }
181  }
182  const char* field_begin = field_buf.get();
183  const char* field_end = field_buf.get() + j;
184  trim_space(field_begin, field_end);
185  trim_quotes(field_begin, field_end, copy_params);
186  row.emplace_back(field_begin, field_end - field_begin);
187  }
188  field = p + 1;
189  has_escape = false;
190  strip_quotes = false;
191 
192  if (is_eol(*p, copy_params)) {
193  // We are at the end of the row. Skip the line endings now.
194  while (p + 1 < buf_end && is_eol(*(p + 1), copy_params)) {
195  p++;
196  }
197  break;
198  }
199  }
200  }
201  }
202  /*
203  @TODO(wei) do error handling
204  */
205  if (in_quote) {
206  LOG(ERROR) << "Unmatched quote.";
207  try_single_thread = true;
208  }
209  if (in_array) {
210  LOG(ERROR) << "Unmatched array.";
211  try_single_thread = true;
212  }
213  return p;
214 }
215 
216 void DelimitedParserUtils::parseStringArray(const std::string& s,
217  const Importer_NS::CopyParams& copy_params,
218  std::vector<std::string>& string_vec) {
219  if (s == copy_params.null_str || s == "NULL" || s.size() < 1 || s.empty()) {
220  // TODO: should not convert NULL, empty arrays to {"NULL"},
221  // need to support NULL, empty properly
222  string_vec.emplace_back("NULL");
223  return;
224  }
225  if (s[0] != copy_params.array_begin || s[s.size() - 1] != copy_params.array_end) {
226  throw std::runtime_error("Malformed Array :" + s);
227  }
228 
229  std::string row(s.c_str() + 1, s.length() - 2);
230  row.push_back('\n');
231  bool try_single_thread = false;
232  Importer_NS::CopyParams array_params = copy_params;
233  array_params.delimiter = copy_params.array_delim;
234  get_row(row.c_str(),
235  row.c_str() + row.length(),
236  row.c_str() + row.length(),
237  array_params,
238  nullptr,
239  string_vec,
240  try_single_thread);
241 
242  for (size_t i = 0; i < string_vec.size(); ++i) {
243  if (string_vec[i].empty()) { // Disallow empty strings for now
244  string_vec.erase(string_vec.begin() + i);
245  --i;
246  } else if (string_vec[i].size() > StringDictionary::MAX_STRLEN) {
247  throw std::runtime_error("Array String too long : " + string_vec[i] + " max is " +
249  }
250  }
251 }
252 } // namespace Importer_NS
static size_t find_end(const char *buffer, size_t size, const CopyParams &copy_params, unsigned int &num_rows_this_buffer)
Finds the closest possible row ending to the end of the given buffer.
std::string null_str
Definition: CopyParams.h:47
static const char * get_row(const char *buf, const char *buf_end, const char *entire_buf_end, const Importer_NS::CopyParams &copy_params, const bool *is_array, std::vector< std::string > &row, bool &try_single_thread)
Parses the first row in the given buffer and inserts fields into given vector.
void trim_quotes(const char *&field_begin, const char *&field_end, const Importer_NS::CopyParams &copy_params)
#define LOG(tag)
Definition: Logger.h:185
static const std::string trim_space(const char *field, const size_t len)
Definition: Importer.cpp:220
std::string to_string(char const *&&v)
const rapidjson::Value & field(const rapidjson::Value &obj, const char field[]) noexcept
Definition: JsonAccessors.h:31
void trim_space(const char *&field_begin, const char *&field_end)
static size_t find_beginning(const char *buffer, size_t begin, size_t end, const CopyParams &copy_params)
Finds the closest possible row beginning in the given buffer.
static constexpr size_t MAX_STRLEN
static void parseStringArray(const std::string &s, const Importer_NS::CopyParams &copy_params, std::vector< std::string > &string_vec)
Parses given string array and inserts into given vector of strings.
bool is_eol(const char &c, const Importer_NS::CopyParams &copy_params)