OmniSciDB  fe05a0c208
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
DateTimeParser.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "DateTimeParser.h"
18 #include "StringTransform.h"
19 
20 #include <boost/algorithm/string/predicate.hpp>
21 
22 #include <algorithm>
23 #include <array>
24 #include <cctype>
25 #include <charconv>
26 #include <limits>
27 #include <sstream>
28 #include <vector>
29 
30 namespace {
31 
// Pack three characters into one int: a in bits 16+, b in bits 8-15, c in bits 0-7.
constexpr int monthKey(char const a, char const b, char const c) {
  return int(a) << 16 | int(b) << 8 | int(c);
}

// Packed lowercase 3-letter month abbreviations in calendar order (index 0 = "jan").
// The %b handler builds the same kind of key from the input and searches this table.
constexpr std::array<int, 12> month_prefixes{{monthKey('j', 'a', 'n'),
                                              monthKey('f', 'e', 'b'),
                                              monthKey('m', 'a', 'r'),
                                              monthKey('a', 'p', 'r'),
                                              monthKey('m', 'a', 'y'),
                                              monthKey('j', 'u', 'n'),
                                              monthKey('j', 'u', 'l'),
                                              monthKey('a', 'u', 'g'),
                                              monthKey('s', 'e', 'p'),
                                              monthKey('o', 'c', 't'),
                                              monthKey('n', 'o', 'v'),
                                              monthKey('d', 'e', 'c')}};
44 
// Remainder of each full month name after its first 3 letters, indexed by the
// 1-based month number that eatMonth() receives (1 = January ... 12 = December).
// Index 0 is an unused placeholder so that no offset arithmetic is needed.
// The placeholder must be followed by a comma: without one, adjacent string
// literals are concatenated and every suffix lands one slot too early.
constexpr std::array<std::string_view, 13> month_suffixes{{"",
                                                           "uary",
                                                           "ruary",
                                                           "ch",
                                                           "il",
                                                           "",
                                                           "e",
                                                           "y",
                                                           "ust",
                                                           "tember",
                                                           "ober",
                                                           "ember",
                                                           "ember"}};
59 
// pow_10[i] == 10^i for i in [0, 9]. Used to scale between second, milli-, micro-
// and nanosecond units.
constexpr unsigned pow_10[10]{1,
                              10,
                              100,
                              1'000,
                              10'000,
                              100'000,
                              1'000'000,
                              10'000'000,
                              100'000'000,
                              1'000'000'000};
62 
// Number of days from 1970-01-01 to the Gregorian date y-m-d (negative before epoch).
// Credit: http://howardhinnant.github.io/date_algorithms.html#days_from_civil
// The computation treats March as the first month of the year, so January and
// February are counted as months 10 and 11 of the preceding year.
int64_t daysFromCivil(int64_t y, unsigned const m, unsigned const d) {
  bool const jan_or_feb = m <= 2;
  if (jan_or_feb) {
    --y;
  }
  // Floor division by 400 so that negative years land in the correct era.
  int64_t const era = (y >= 0 ? y : y - 399) / 400;
  unsigned const year_of_era = static_cast<unsigned>(y - era * 400);  // [0, 399]
  unsigned const shifted_month = jan_or_feb ? m + 9 : m - 3;          // March == 0
  unsigned const day_of_year = (153 * shifted_month + 2) / 5 + d - 1;  // [0, 365]
  unsigned const day_of_era =
      year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;  // [0, 146096]
  return era * 146097 + static_cast<int64_t>(day_of_era) - 719468;
}
73 
// Build the table of accepted format strings, one list per format type.
// Order of entries correspond to enum class FormatType { Date, Time, Timezone }.
// Within a list the formats are tried in the order given here.
std::vector<std::vector<std::string_view>> formatViews() {
  std::vector<std::string_view> const date_formats{
      "%Y-%m-%d", "%m/%d/%y", "%m/%d/%Y", "%Y/%m/%d", "%d-%b-%y", "%d/%b/%Y"};
  std::vector<std::string_view> const time_formats{"%I:%M:%S %p",
                                                   "%H:%M:%S",
                                                   "%I:%M %p",
                                                   "%H:%M",
                                                   "%H%M%S",
                                                   "%I . %M . %S %p",
                                                   "%I %p"};
  std::vector<std::string_view> const timezone_formats{"%z"};
  return {date_formats, time_formats, timezone_formats};
}
86 
87 // Optionally eat month name after first 3 letters. Assume first 3 letters are correct.
88 void eatMonth(unsigned const month, std::string_view& str) {
89  str.remove_prefix(3);
90  std::string_view const suffix = month_suffixes[month];
91  if (boost::algorithm::istarts_with(str, suffix)) {
92  str.remove_prefix(suffix.size());
93  }
94 }
95 
// Remove all leading whitespace (as classified by std::isspace) from str.
void eatSpace(std::string_view& str) {
  // std::isspace is undefined for negative arguments other than EOF, so bytes
  // outside the ASCII range must be converted to unsigned char first.
  while (!str.empty() && std::isspace(static_cast<unsigned char>(str.front()))) {
    str.remove_prefix(1);
  }
}
101 
// Parse a number of type T from the front of str, reading at most maxlen characters.
// On success the parsed characters are removed from str and the value is returned.
// On failure std::nullopt is returned and str is left untouched.
template <typename T>
std::optional<T> fromChars(std::string_view& str,
                           size_t maxlen = std::numeric_limits<size_t>::max()) {
  char const* const first = str.data();
  char const* const last = first + std::min(maxlen, str.size());
  T value;
  auto const [stop, error] = std::from_chars(first, last, value);
  if (error != std::errc()) {
    return std::nullopt;
  }
  str.remove_prefix(stop - first);
  return value;
}
118 
// Interpret str as an integer count of seconds, optionally followed by a fraction.
// Accepts str =~ /^-?\d+(\.\d*)?$/ and returns the integer part; the fractional
// digits are validated but discarded. Returns std::nullopt for any other input.
std::optional<int64_t> unixTime(std::string_view const str) {
  char const* const begin = str.data();
  char const* const end = begin + str.size();
  int64_t time{0};
  auto const result = std::from_chars(begin, end, time);
  if (result.ec != std::errc()) {
    return std::nullopt;
  }
  if (result.ptr == end) {
    return time;
  }
  if (*result.ptr != '.') {
    return std::nullopt;
  }
  // std::isdigit is undefined for negative arguments other than EOF, so each
  // byte is converted to unsigned char before classification.
  bool const fraction_is_digits = std::all_of(result.ptr + 1, end, [](char const c) {
    return std::isdigit(static_cast<unsigned char>(c)) != 0;
  });
  return fraction_is_digits ? std::make_optional(time) : std::nullopt;
}
129 
130 } // namespace
131 
132 // Interpret str according to DateTimeParser::FormatType::Time.
133 // Return number of (s,ms,us,ns) since midnight based on dim in (0,3,6,9) resp.
134 template <>
135 std::optional<int64_t> dateTimeParseOptional<kTIME>(std::string_view str,
136  unsigned const dim) {
137  if (!str.empty() && str.front() == 'T') {
138  str.remove_prefix(1);
139  }
142  std::optional<int64_t> time = parser.parse(str, dim);
143  if (!time) {
144  return std::nullopt;
145  }
146  // Parse optional timezone
147  std::string_view timezone = parser.unparsed();
149  std::optional<int64_t> tz = parser.parse(timezone, dim);
150  if (!parser.unparsed().empty()) {
151  return std::nullopt;
152  }
153  return *time + tz.value_or(0);
154 }
155 
156 // Interpret str according to DateTimeParser::FormatType::Date and Time.
157 // Return number of (s,ms,us,ns) since epoch based on dim in (0,3,6,9) resp.
158 template <>
159 std::optional<int64_t> dateTimeParseOptional<kTIMESTAMP>(std::string_view str,
160  unsigned const dim) {
161  if (!str.empty() && str.front() == 'T') {
162  str.remove_prefix(1);
163  }
165  // Parse date
167  std::optional<int64_t> date = parser.parse(str, dim);
168  if (!date) {
169  return unixTime(str);
170  }
171  // Parse time-of-day
172  std::string_view time_of_day = parser.unparsed();
173  if (time_of_day.empty()) {
174  return std::nullopt;
175  } else if (time_of_day.front() == 'T' || time_of_day.front() == ':') {
176  time_of_day.remove_prefix(1);
177  }
179  std::optional<int64_t> time = parser.parse(time_of_day, dim);
180  // Parse optional timezone
181  std::string_view timezone = parser.unparsed();
183  std::optional<int64_t> tz = parser.parse(timezone, dim);
184  return *date + time.value_or(0) + tz.value_or(0);
185 }
186 
187 // Interpret str according to DateTimeParser::FormatType::Date.
188 // Return number of (s,ms,us,ns) since epoch based on dim in (0,3,6,9) resp.
189 template <>
190 std::optional<int64_t> dateTimeParseOptional<kDATE>(std::string_view str,
191  unsigned const dim) {
193  // Parse date
195  std::optional<int64_t> date = parser.parse(str, dim);
196  if (!date) {
197  return unixTime(str);
198  }
199  // Parse optional timezone
200  std::string_view timezone = parser.unparsed();
202  std::optional<int64_t> tz = parser.parse(timezone, dim);
203  return *date + tz.value_or(0);
204 }
205 
206 // Return number of (s,ms,us,ns) since epoch based on dim in (0,3,6,9) resp.
207 int64_t DateTimeParser::DateTime::getTime(unsigned const dim) const {
208  int64_t const days = daysFromCivil(Y, m, d);
209  int const seconds =
210  static_cast<int>(3600 * H + 60 * M + S) - z +
211  (p ? *p && H != 12 ? 12 * 3600 : !*p && H == 12 ? -12 * 3600 : 0 : 0);
212  return (24 * 3600 * days + seconds) * pow_10[dim] + n / pow_10[9 - dim];
213 }
214 
215 // Return true if successful parse, false otherwise. Update dt_ and str.
216 // OK to be destructive to str on failed match.
217 bool DateTimeParser::parseWithFormat(std::string_view format, std::string_view& str) {
218  while (!format.empty()) {
219  if (format.front() == '%') {
220  eatSpace(str);
221  if (!updateDateTimeAndStr(format[1], str)) {
222  return false;
223  }
224  format.remove_prefix(2);
225  } else if (isspace(format.front())) {
226  eatSpace(format);
227  eatSpace(str);
228  } else if (!str.empty() && format.front() == str.front()) {
229  format.remove_prefix(1);
230  str.remove_prefix(1);
231  } else {
232  return false;
233  }
234  }
235  return true;
236 }
237 
238 // Update dt_ based on given str and current value of format_type_.
239 // Return number of (s,ms,us,ns) since epoch based on dim in (0,3,6,9) resp.
240 // or std::nullopt if no format matches str.
241 // In either case, update unparsed_ to the remaining part of str that was not matched.
242 std::optional<int64_t> DateTimeParser::parse(std::string_view const str, unsigned dim) {
243  static std::vector<std::vector<std::string_view>> const& format_views = formatViews();
244  auto const& formats = format_views.at(static_cast<int>(format_type_));
245  for (std::string_view const format : formats) {
246  std::string_view str_unparsed = str;
247  if (parseWithFormat(format, str_unparsed)) {
248  unparsed_ = str_unparsed;
249  return dt_.getTime(dim);
250  }
251  }
252  unparsed_ = str;
253  return std::nullopt;
254 }
255 
257  dt_ = DateTime();
258 }
259 
261  resetDateTime();
262  format_type_ = format_type;
263 }
264 
265 std::string_view DateTimeParser::unparsed() const {
266  return unparsed_;
267 }
268 
269 // Return true if successful parse, false otherwise. Update dt_ and str on success.
270 // OK to be destructive to str on failed parse.
271 bool DateTimeParser::updateDateTimeAndStr(char const field, std::string_view& str) {
272  switch (field) {
273  case 'Y':
274  if (auto const year = fromChars<int64_t>(str)) {
275  dt_.Y = *year;
276  return true;
277  }
278  return false;
279  case 'y':
280  // %y matches 1 or 2 digits. If 3 or more digits are provided,
281  // then it is considered an unsuccessful parse.
282  if (auto const year = fromChars<unsigned>(str)) {
283  if (*year < 69) {
284  dt_.Y = 2000 + *year;
285  return true;
286  } else if (*year < 100) {
287  dt_.Y = 1900 + *year;
288  return true;
289  }
290  }
291  return false;
292  case 'm':
293  if (auto const month = fromChars<unsigned>(str, 2)) {
294  if (1 <= *month && *month <= 12) {
295  dt_.m = *month;
296  return true;
297  }
298  }
299  return false;
300  case 'b':
301  if (3 <= str.size()) {
302  int const key =
303  std::tolower(str[0]) << 16 | std::tolower(str[1]) << 8 | std::tolower(str[2]);
304  constexpr auto end = month_prefixes.data() + month_prefixes.size();
305  // This is faster than a lookup into a std::unordered_map.
306  auto const ptr = std::find(month_prefixes.data(), end, key);
307  if (ptr != end) {
308  dt_.m = ptr - month_prefixes.data() + 1;
309  eatMonth(dt_.m, str);
310  return true;
311  }
312  }
313  return false;
314  case 'd':
315  if (auto const day = fromChars<unsigned>(str, 2)) {
316  if (1 <= *day && *day <= 31) {
317  dt_.d = *day;
318  return true;
319  }
320  }
321  return false;
322  case 'H':
323  if (auto const hour = fromChars<unsigned>(str, 2)) {
324  if (*hour <= 23) {
325  dt_.H = *hour;
326  return true;
327  }
328  }
329  return false;
330  case 'I':
331  if (auto const hour = fromChars<unsigned>(str, 2)) {
332  if (1 <= *hour && *hour <= 12) {
333  dt_.H = *hour;
334  return true;
335  }
336  }
337  return false;
338  case 'M':
339  if (auto const minute = fromChars<unsigned>(str, 2)) {
340  if (*minute <= 59) {
341  dt_.M = *minute;
342  return true;
343  }
344  }
345  return false;
346  case 'S':
347  if (auto const second = fromChars<unsigned>(str, 2)) {
348  if (*second <= 61) {
349  dt_.S = *second;
350  if (!str.empty() && str.front() == '.') {
351  str.remove_prefix(1);
352  size_t len = str.size();
353  if (auto const ns = fromChars<unsigned>(str, 9)) {
354  len -= str.size();
355  dt_.n = *ns * pow_10[9 - len];
356  } else {
357  return false; // Reject period not followed by a digit
358  }
359  }
360  return true;
361  }
362  }
363  return false;
364  case 'z':
365  // [-+]\d\d:?\d\d
366  if (5 <= str.size() && (str.front() == '-' || str.front() == '+') &&
367  isdigit(str[1]) && isdigit(str[2]) && isdigit(str[4]) &&
368  (str[3] == ':' ? 6 <= str.size() && isdigit(str[5]) : isdigit(str[3]))) {
369  char const* sep = &str[3];
370  int hours{0}, minutes{0};
371  std::from_chars(str.data() + 1, sep, hours);
372  sep += *sep == ':';
373  std::from_chars(sep, sep + 2, minutes);
374  dt_.z = (str.front() == '-' ? -60 : 60) * (60 * hours + minutes);
375  str.remove_prefix(sep - str.data() + 2);
376  return true;
377  }
378  return false;
379  case 'p':
380  // %p implies optional, so never return false
381  if (boost::algorithm::istarts_with(str, "am") ||
382  boost::algorithm::istarts_with(str, "pm") ||
383  boost::algorithm::istarts_with(str, "a.m.") ||
384  boost::algorithm::istarts_with(str, "p.m.")) {
385  dt_.p = std::tolower(str.front()) == 'p';
386  str.remove_prefix(std::tolower(str[1]) == 'm' ? 2 : 4);
387  } else {
388  dt_.p.reset();
389  }
390  return true;
391  default:
392  throw std::runtime_error(cat("Unrecognized format: %", field));
393  }
394 }
395 
396 std::ostream& operator<<(std::ostream& out, DateTimeParser::DateTime const& dt) {
397  return out << dt.Y << '-' << dt.m << '-' << dt.d << ' ' << dt.H << ':' << dt.M << ':'
398  << dt.S << '.' << dt.n << " p("
399  << (dt.p ? *dt.p ? "true" : "false" : "unset") << ") z(" << dt.z << ')';
400 }
std::optional< bool > p
std::optional< int64_t > parse(std::string_view const, unsigned dim)
std::string cat(Ts &&...args)
bool updateDateTimeAndStr(char const field, std::string_view &)
tuple d
Definition: test_fsi.py:9
std::optional< int64_t > unixTime(std::string_view const str)
std::ostream & operator<<(std::ostream &os, const SessionInfo &session_info)
Definition: SessionInfo.cpp:53
std::string_view unparsed_
std::vector< std::vector< std::string_view > > formatViews()
void eatMonth(unsigned const month, std::string_view &str)
bool parseWithFormat(std::string_view format, std::string_view &str)
void eatSpace(std::string_view &str)
std::optional< int64_t > dateTimeParseOptional< kTIME >(std::string_view str, unsigned const dim)
const rapidjson::Value & field(const rapidjson::Value &obj, const char field[]) noexcept
Definition: JsonAccessors.h:31
std::optional< T > fromChars(std::string_view &str, size_t maxlen=std::numeric_limits< size_t >::max())
std::optional< int64_t > dateTimeParseOptional< kDATE >(std::string_view str, unsigned const dim)
std::optional< int64_t > dateTimeParseOptional< kTIMESTAMP >(std::string_view str, unsigned const dim)
int64_t getTime(unsigned const dim) const
constexpr std::array< int, 12 > month_prefixes
constexpr std::array< std::string_view, 13 > month_suffixes
FormatType format_type_
std::string suffix(SQLTypes type)
Definition: GeoIR.cpp:419
int64_t daysFromCivil(int64_t y, unsigned const m, unsigned const d)
void setFormatType(FormatType)
std::string_view unparsed() const