OmniSciDB  ca0c39ec8f
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringOps.h"
18 #include "Shared/base64.h"
19 
20 #include <rapidjson/document.h>
21 #include <boost/algorithm/string/predicate.hpp>
22 
23 namespace StringOps_Namespace {
24 
25 boost::regex StringOp::generateRegex(const std::string& op_name,
26  const std::string& regex_pattern,
27  const std::string& regex_params,
28  const bool supports_sub_matches) {
29  bool is_case_sensitive = false;
30  bool is_case_insensitive = false;
31 
32  for (const auto& c : regex_params) {
33  switch (c) {
34  case 'c':
35  is_case_sensitive = true;
36  break;
37  case 'i':
38  is_case_insensitive = true;
39  break;
40  case 'e': {
41  if (!supports_sub_matches) {
42  throw std::runtime_error(op_name +
43  " does not support 'e' (sub-matches) option.");
44  }
45  // We use e to set sub-expression group in a separate initializer
46  // but need to have this entry to not error on the default path
47  break;
48  }
49  default: {
50  if (supports_sub_matches) {
51  throw std::runtime_error("Unrecognized regex parameter for " + op_name +
52  ", expected either 'c' 'i', or 'e'.");
53  }
54  throw std::runtime_error("Unrecognized regex parameter for " + op_name +
55  ", expected either 'c' or 'i'.");
56  }
57  }
58  }
59  if (!is_case_sensitive && !is_case_insensitive) {
60  throw std::runtime_error(op_name +
61  " params must either specify case-sensitivity ('c') or "
62  "case-insensitivity ('i').");
63  }
64  if (is_case_sensitive && is_case_insensitive) {
65  throw std::runtime_error(op_name +
66  " params cannot specify both case-sensitivity ('c') and "
67  "case-insensitivity ('i').");
68  }
69  if (is_case_insensitive) {
70  return boost::regex(regex_pattern,
71  boost::regex_constants::extended |
72  boost::regex_constants::optimize |
73  boost::regex_constants::icase);
74  } else {
75  return boost::regex(
76  regex_pattern,
77  boost::regex_constants::extended | boost::regex_constants::optimize);
78  }
79 }
80 
81 NullableStrType TryStringCast::operator()(const std::string& str) const {
82  CHECK("Invalid string output for TryStringCast");
83  return NullableStrType();
84 }
85 
86 Datum TryStringCast::numericEval(const std::string_view str) const {
87  if (str.empty()) {
88  return NullDatum(return_ti_);
89  }
90  // Need to make copy for now b/c StringToDatum can mod SQLTypeInfo arg
91  SQLTypeInfo return_ti(return_ti_);
92  try {
93  return StringToDatum(str, return_ti);
94  } catch (std::runtime_error& e) {
95  return NullDatum(return_ti);
96  }
97 }
98 
99 NullableStrType Lower::operator()(const std::string& str) const {
100  std::string output_str(str);
102  output_str.begin(), output_str.end(), output_str.begin(), [](unsigned char c) {
103  return std::tolower(c);
104  });
105  return output_str;
106 }
107 
108 NullableStrType Upper::operator()(const std::string& str) const {
109  std::string output_str(str);
111  output_str.begin(), output_str.end(), output_str.begin(), [](unsigned char c) {
112  return std::toupper(c);
113  });
114  return output_str;
115 }
116 
117 NullableStrType InitCap::operator()(const std::string& str) const {
118  std::string output_str(str);
119  bool last_char_whitespace = true; // Beginning of string counts as whitespace
120  for (auto& c : output_str) {
121  if (isspace(c) || delimiter_bitmap_[reinterpret_cast<const uint8_t&>(c)]) {
122  last_char_whitespace = true;
123  continue;
124  }
125  if (last_char_whitespace) {
126  c = toupper(c);
127  last_char_whitespace = false;
128  } else {
129  c = tolower(c);
130  }
131  }
132  return output_str;
133 }
134 
135 NullableStrType Reverse::operator()(const std::string& str) const {
136  const std::string reversed_str = std::string(str.rbegin(), str.rend());
137  return reversed_str;
138 }
139 
140 NullableStrType Repeat::operator()(const std::string& str) const {
141  std::string repeated_str;
142  repeated_str.reserve(str.size() * n_);
143  for (size_t r = 0; r < n_; ++r) {
144  repeated_str += str;
145  }
146  return repeated_str;
147 }
148 
149 NullableStrType Concat::operator()(const std::string& str) const {
150  return reverse_order_ ? str_literal_ + str : str + str_literal_;
151 }
152 
153 NullableStrType Pad::operator()(const std::string& str) const {
154  return pad_mode_ == Pad::PadMode::LEFT ? lpad(str) : rpad(str);
155 }
156 
157 std::string Pad::lpad(const std::string& str) const {
158  const auto str_len = str.size();
159  const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
160  if (chars_to_fill == 0UL) {
161  return str.substr(0, padded_length_);
162  }
163  // If here we need to add characters from the padding_string_
164  // to fill the difference between str_len and padded_length_
165  if (padding_string_length_ == 1UL) {
166  return std::string(chars_to_fill, padding_char_) + str;
167  }
168 
169  std::string fitted_padding_str;
170  fitted_padding_str.reserve(chars_to_fill);
171  for (size_t i = 0; i < chars_to_fill; ++i) {
172  fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
173  }
174  return fitted_padding_str + str;
175 }
176 
177 std::string Pad::rpad(const std::string& str) const {
178  const auto str_len = str.size();
179  const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
180  if (chars_to_fill == 0UL) {
181  return str.substr(str_len - padded_length_, std::string::npos);
182  }
183  // If here we need to add characters from the padding_string_
184  // to fill the difference between str_len and padded_length_
185  if (padding_string_length_ == 1UL) {
186  return str + std::string(chars_to_fill, padding_char_);
187  }
188 
189  std::string fitted_padding_str;
190  fitted_padding_str.reserve(chars_to_fill);
191  for (size_t i = 0; i < chars_to_fill; ++i) {
192  fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
193  }
194  return str + fitted_padding_str;
195 }
196 
197 Pad::PadMode Pad::op_kind_to_pad_mode(const SqlStringOpKind op_kind) {
198  switch (op_kind) {
200  return PadMode::LEFT;
202  return PadMode::RIGHT;
203  default:
204  UNREACHABLE();
205  // Not reachable, but make compiler happy
206  return PadMode::LEFT;
207  };
208 }
209 
210 NullableStrType Trim::operator()(const std::string& str) const {
211  const auto str_len = str.size();
212  size_t trim_begin = 0;
213  if (trim_mode_ == TrimMode::LEFT || trim_mode_ == TrimMode::BOTH) {
214  while (trim_begin < str_len &&
215  trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_begin])]) {
216  ++trim_begin;
217  }
218  }
219  size_t trim_end = str_len - 1;
220  if (trim_mode_ == TrimMode::RIGHT || trim_mode_ == TrimMode::BOTH) {
221  while (trim_end > trim_begin &&
222  trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_end])]) {
223  --trim_end;
224  }
225  }
226  if (trim_begin == 0 && trim_end == str_len - 1) {
227  return str;
228  }
229  return str.substr(trim_begin, trim_end + 1 - trim_begin);
230 }
231 
232 Trim::TrimMode Trim::op_kind_to_trim_mode(const SqlStringOpKind op_kind) {
233  switch (op_kind) {
235  return Trim::TrimMode::BOTH;
237  return Trim::TrimMode::LEFT;
239  return Trim::TrimMode::RIGHT;
240  default:
241  UNREACHABLE();
242  // Not reachable, but make compiler happy
243  return Trim::TrimMode::BOTH;
244  };
245 }
246 
247 NullableStrType Substring::operator()(const std::string& str) const {
248  // If start_ is negative then we start abs(start_) characters from the end
249  // of the string
250  const int64_t str_len = str.size();
251  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
252  const size_t capped_start =
253  wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
254  return str.substr(capped_start, length_);
255 }
256 
257 NullableStrType Overlay::operator()(const std::string& base_str) const {
258  // If start_ is negative then we start abs(start_) characters from the end
259  // of the string
260  const int64_t str_len = base_str.size();
261  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
262  const size_t capped_start =
263  wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
264  std::string replaced_str = base_str.substr(0, capped_start);
265  replaced_str += insert_str_;
266  const size_t remainder_start =
267  std::min(wrapped_start + replacement_length_, size_t(str_len));
268  const size_t remainder_length = static_cast<size_t>(str_len) - remainder_start;
269  replaced_str += base_str.substr(remainder_start, remainder_length);
270  return replaced_str;
271 }
272 
273 NullableStrType Replace::operator()(const std::string& str) const {
274  std::string replaced_str(str);
275 
276  size_t search_start_index = 0;
277  while (true) {
278  search_start_index = replaced_str.find(pattern_str_, search_start_index);
279  if (search_start_index == std::string::npos) {
280  break;
281  }
282  replaced_str.replace(search_start_index, pattern_str_len_, replacement_str_);
283  search_start_index += replacement_str_len_;
284  }
285  return replaced_str;
286 }
287 
288 NullableStrType SplitPart::operator()(const std::string& str) const {
289  // If split_part_ is negative then it is taken as the number
290  // of split parts from the end of the string
291 
292  if (delimiter_ == "") {
293  return str;
294  }
295 
296  const size_t str_len = str.size();
297  size_t delimiter_pos = reverse_ ? str_len : 0UL;
298  size_t last_delimiter_pos;
299  size_t delimiter_idx = 0UL;
300 
301  do {
302  last_delimiter_pos = delimiter_pos;
303  delimiter_pos = reverse_ ? str.rfind(delimiter_, delimiter_pos - 1UL)
304  : str.find(delimiter_, delimiter_pos + delimiter_length_);
305  } while (delimiter_pos != std::string::npos && ++delimiter_idx < split_part_);
306 
307  if (delimiter_idx == 0UL && split_part_ == 1UL) {
308  // No delimiter was found, but the first match is requested, which here is
309  // the whole string
310  return str;
311  }
312 
313  if (delimiter_pos == std::string::npos &&
314  (delimiter_idx < split_part_ - 1UL || delimiter_idx < 1UL)) {
315  // split_part_ was out of range
316  return NullableStrType(); // null string
317  }
318 
319  if (reverse_) {
320  const size_t substr_start =
321  delimiter_pos == std::string::npos ? 0UL : delimiter_pos + delimiter_length_;
322  return str.substr(substr_start, last_delimiter_pos - substr_start);
323  } else {
324  const size_t substr_start =
325  split_part_ == 1UL ? 0UL : last_delimiter_pos + delimiter_length_;
326  return str.substr(substr_start, delimiter_pos - substr_start);
327  }
328 }
329 
330 NullableStrType RegexpReplace::operator()(const std::string& str) const {
331  const int64_t str_len = str.size();
332  const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
333  const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
334  if (occurrence_ == 0L) {
335  std::string result;
336  std::string::const_iterator replace_start(str.cbegin() + wrapped_start);
337  boost::regex_replace(std::back_inserter(result),
338  replace_start,
339  str.cend(),
340  regex_pattern_,
341  replacement_);
342  return str.substr(0UL, wrapped_start) + result;
343  } else {
344  const auto occurrence_match_pos = RegexpReplace::get_nth_regex_match(
345  str,
346  wrapped_start,
347  regex_pattern_,
348  occurrence_ > 0 ? occurrence_ - 1 : occurrence_);
349  if (occurrence_match_pos.first == std::string::npos) {
350  // No match found, return original string
351  return str;
352  }
353  std::string result;
354  std::string::const_iterator replace_start(str.cbegin() + occurrence_match_pos.first);
355  std::string::const_iterator replace_end(str.cbegin() + occurrence_match_pos.second);
356  std::string replaced_match;
357  boost::regex_replace(std::back_inserter(replaced_match),
358  replace_start,
359  replace_end,
360  regex_pattern_,
361  replacement_);
362  return str.substr(0UL, occurrence_match_pos.first) + replaced_match +
363  str.substr(occurrence_match_pos.second, std::string::npos);
364  }
365 }
366 
367 std::pair<size_t, size_t> RegexpReplace::get_nth_regex_match(
368  const std::string& str,
369  const size_t start_pos,
370  const boost::regex& regex_pattern,
371  const int64_t occurrence) {
372  std::vector<std::pair<size_t, size_t>> regex_match_positions;
373  std::string::const_iterator search_start(str.cbegin() + start_pos);
374  boost::smatch match;
375  int64_t match_idx = 0;
376  size_t string_pos = start_pos;
377  while (boost::regex_search(search_start, str.cend(), match, regex_pattern)) {
378  string_pos += match.position(size_t(0)) + match.length(0);
379  regex_match_positions.emplace_back(
380  std::make_pair(string_pos - match.length(0), string_pos));
381  if (match_idx++ == occurrence) {
382  return regex_match_positions.back();
383  }
384  search_start =
385  match.suffix().first; // Move to position after last char of matched string
386  // Position is relative to last match/initial iterator, so need to increment our
387  // string_pos accordingly
388  }
389  // occurrence only could have a valid match if negative here,
390  // but don't want to check in inner loop for performance reasons
391  const int64_t wrapped_match = occurrence >= 0 ? occurrence : match_idx + occurrence;
392  if (wrapped_match < 0 || wrapped_match >= match_idx) {
393  // Represents a non-match
394  return std::make_pair(std::string::npos, std::string::npos);
395  }
396  return regex_match_positions[wrapped_match];
397 }
398 
399 NullableStrType RegexpSubstr::operator()(const std::string& str) const {
400  const int64_t str_len = str.size();
401  const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
402  const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
403  int64_t match_idx = 0;
404  // Apears std::regex_search does not support string_view?
405  std::vector<std::string> regex_matches;
406  std::string::const_iterator search_start(str.cbegin() + wrapped_start);
407  boost::smatch match;
408  while (boost::regex_search(search_start, str.cend(), match, regex_pattern_)) {
409  if (match_idx++ == occurrence_) {
410  if (sub_match_info_.first) {
411  return RegexpSubstr::get_sub_match(match, sub_match_info_);
412  }
413  return NullableStrType(match[0]);
414  }
415  regex_matches.emplace_back(match[0]);
416  search_start =
417  match.suffix().first; // Move to position after last char of matched string
418  }
419  const int64_t wrapped_match = occurrence_ >= 0 ? occurrence_ : match_idx + occurrence_;
420  if (wrapped_match < 0 || wrapped_match >= match_idx) {
421  return NullableStrType();
422  }
423  if (sub_match_info_.first) {
424  return RegexpSubstr::get_sub_match(match, sub_match_info_);
425  }
426  return regex_matches[wrapped_match];
427 }
428 
429 std::string RegexpSubstr::get_sub_match(const boost::smatch& match,
430  const std::pair<bool, int64_t> sub_match_info) {
431  const int64_t num_sub_matches = match.size() - 1;
432  const int64_t wrapped_sub_match = sub_match_info.second >= 0
433  ? sub_match_info.second
434  : num_sub_matches + sub_match_info.second;
435  if (wrapped_sub_match < 0 || wrapped_sub_match >= num_sub_matches) {
436  return "";
437  }
438  return match[wrapped_sub_match + 1];
439 }
440 
441 std::pair<bool, int64_t> RegexpSubstr::set_sub_match_info(
442  const std::string& regex_pattern,
443  const int64_t sub_match_group_idx) {
444  if (regex_pattern.find("e", 0UL) == std::string::npos) {
445  return std::make_pair(false, 0UL);
446  }
447  return std::make_pair(
448  true, sub_match_group_idx > 0L ? sub_match_group_idx - 1 : sub_match_group_idx);
449 }
450 
451 // json_path must start with "lax $", "strict $" or "$" (case-insensitive).
452 JsonValue::JsonParseMode JsonValue::parse_json_parse_mode(std::string_view json_path) {
453  size_t const string_pos = json_path.find('$');
454  if (string_pos == 0) {
455  // Parsing mode was not explicitly specified, default to PARSE_MODE_LAX
456  return JsonValue::JsonParseMode::PARSE_MODE_LAX;
457  } else if (string_pos == std::string::npos) {
458  throw std::runtime_error("JSON search path must include a '$' literal.");
459  }
460  std::string_view const prefix = json_path.substr(0, string_pos);
461  if (boost::iequals(prefix, std::string_view("lax "))) {
462  return JsonValue::JsonParseMode::PARSE_MODE_LAX;
463  } else if (boost::iequals(prefix, std::string_view("strict "))) {
464  if constexpr (JsonValue::allow_strict_json_parsing) {
465  return JsonValue::JsonParseMode::PARSE_MODE_STRICT;
466  } else {
467  throw std::runtime_error("Strict parsing not currently supported for JSON_VALUE.");
468  }
469  } else {
470  throw std::runtime_error("Issue parsing JSON_VALUE Parse Mode.");
471  }
472 }
473 
474 std::vector<JsonValue::JsonKey> JsonValue::parse_json_path(const std::string& json_path) {
475  // Assume that parse_key_error_mode validated strict/lax mode
476  size_t string_pos = json_path.find("$");
477  if (string_pos == std::string::npos) {
478  throw std::runtime_error("JSON search path must begin with '$' literal.");
479  }
480  string_pos += 1; // Go to next character after $
481 
482  // Use tildas to enclose escaped regex string due to embedded ')"'
483  static const auto& key_regex = *new boost::regex(
484  R"~(^(\.(([[:alpha:]][[:alnum:]_-]*)|"([[:alpha:]][ [:alnum:]_-]*)"))|\[([[:digit:]]+)\])~",
485  boost::regex_constants::extended | boost::regex_constants::optimize);
486  static_assert(std::is_trivially_destructible_v<decltype(key_regex)>);
487 
488  std::string::const_iterator search_start(json_path.cbegin() + string_pos);
489  boost::smatch match;
490  std::vector<JsonKey> json_keys;
491  while (boost::regex_search(search_start, json_path.cend(), match, key_regex)) {
492  CHECK_EQ(match.size(), 6UL);
493  if (match.position(size_t(0)) != 0L) {
494  // Match wasn't found at beginning of string
495  throw std::runtime_error("JSON search path parsing error: '" + json_path + "'");
496  }
497  size_t matching_expr = 0;
498  if (match[3].matched) {
499  // simple object key
500  matching_expr = 3;
501  } else if (match[4].matched) {
502  // complex object key
503  matching_expr = 4;
504  } else if (match[5].matched) {
505  // array key
506  matching_expr = 5;
507  }
508  CHECK_GT(matching_expr, 0UL);
509  string_pos += match.length(0);
510 
511  const std::string key_match(match[matching_expr].first, match[matching_expr].second);
512  CHECK_GE(key_match.length(), 1UL);
513  if (isalpha(key_match[0])) {
514  // Object key
515  json_keys.emplace_back(JsonKey(key_match));
516  } else {
517  // Array key
518  json_keys.emplace_back(JsonKey(std::stoi(key_match)));
519  }
520  search_start =
521  match.suffix().first; // Move to position after last char of matched string
522  }
523  if (json_keys.empty()) {
524  throw std::runtime_error("No keys found in JSON search path.");
525  }
526  if (string_pos < json_path.size()) {
527  throw std::runtime_error("JSON path parsing error.");
528  }
529  return json_keys;
530 }
531 
532 NullableStrType JsonValue::operator()(const std::string& str) const {
533  rapidjson::Document document;
534  if (document.Parse(str.c_str()).HasParseError()) {
535  if constexpr (JsonValue::allow_strict_json_parsing) {
536  return handle_parse_error(str);
537  } else {
538  return NullableStrType();
539  }
540  }
541  rapidjson::Value& json_val = document;
542  for (const auto& json_key : json_keys_) {
543  switch (json_key.key_kind) {
544  case JsonKeyKind::JSON_OBJECT: {
545  if (!json_val.IsObject() || !json_val.HasMember(json_key.object_key)) {
546  if constexpr (JsonValue::allow_strict_json_parsing) {
547  return handle_key_error(str);
548  } else {
549  return NullableStrType();
550  }
551  }
552  json_val = json_val[json_key.object_key];
553  break;
554  }
555  case JsonKeyKind::JSON_ARRAY: {
556  if (!json_val.IsArray() || json_val.Size() <= json_key.array_key) {
557  if constexpr (JsonValue::allow_strict_json_parsing) {
558  return handle_key_error(str);
559  } else {
560  return NullableStrType();
561  }
562  }
563  json_val = json_val[json_key.array_key];
564  break;
565  }
566  }
567  }
568  // Now get value as string
569  if (json_val.IsString()) {
570  return NullableStrType(std::string(json_val.GetString()));
571  } else if (json_val.IsNumber()) {
572  if (json_val.IsDouble()) {
573  return NullableStrType(std::to_string(json_val.GetDouble()));
574  } else if (json_val.IsInt64()) {
575  return NullableStrType(std::to_string(json_val.GetInt64()));
576  } else if (json_val.IsUint64()) {
577  // Need to cover range of uint64 that can't fit int in64
578  return NullableStrType(std::to_string(json_val.GetUint64()));
579  } else {
580  // A bit defensive, as I'm fairly sure json does not
581  // support numeric types with widths > 64 bits, so may drop
582  if constexpr (JsonValue::allow_strict_json_parsing) {
583  return handle_key_error(str);
584  } else {
585  return NullableStrType();
586  }
587  }
588  } else if (json_val.IsBool()) {
589  return NullableStrType(std::string(json_val.IsTrue() ? "true" : "false"));
590  } else if (json_val.IsNull()) {
591  return NullableStrType();
592  } else {
593  // For any unhandled type - we may move this to a CHECK after gaining
594  // more confidence in prod
595  if constexpr (JsonValue::allow_strict_json_parsing) {
596  return handle_key_error(str);
597  } else {
598  return NullableStrType();
599  }
600  }
601 }
602 
603 NullableStrType Base64Encode::operator()(const std::string& str) const {
604  return shared::encode_base64(str);
605 }
606 
607 NullableStrType Base64Decode::operator()(const std::string& str) const {
608  return shared::decode_base64(str);
609 }
610 
611 std::string StringOps::operator()(const std::string& str) const {
612  NullableStrType modified_str(str);
613  for (const auto& string_op : string_ops_) {
614  modified_str = string_op->operator()(modified_str.str);
615  if (modified_str.is_null) {
616  return ""; // How we currently represent dictionary-encoded nulls
617  }
618  }
619  return modified_str.str;
620 }
621 
622 std::string_view StringOps::operator()(const std::string_view sv,
623  std::string& sv_storage) const {
624  sv_storage = sv;
625  NullableStrType nullable_str(sv);
626  for (const auto& string_op : string_ops_) {
627  nullable_str = string_op->operator()(nullable_str.str);
628  if (nullable_str.is_null) {
629  return "";
630  }
631  }
632  sv_storage = nullable_str.str;
633  return sv_storage;
634 }
635 
636 Datum StringOps::numericEval(const std::string_view str) const {
637  NullableStrType modified_str(str);
638  const auto num_string_producing_ops = string_ops_.size() - 1;
639  for (size_t string_op_idx = 0; string_op_idx < num_string_producing_ops;
640  ++string_op_idx) {
641  const auto& string_op = string_ops_[string_op_idx];
642  modified_str = string_op->operator()(modified_str.str);
643  if (modified_str.is_null) {
644  break;
645  }
646  }
647  return string_ops_.back()->numericEval(modified_str.str);
648 }
649 
650 std::vector<std::unique_ptr<const StringOp>> StringOps::genStringOpsFromOpInfos(
651  const std::vector<StringOpInfo>& string_op_infos) const {
652  // Should we handle pure literal expressions here as well
653  // even though they are currently rewritten to string literals?
654  std::vector<std::unique_ptr<const StringOp>> string_ops;
655  string_ops.reserve(string_op_infos.size());
656  for (const auto& string_op_info : string_op_infos) {
657  string_ops.emplace_back(gen_string_op(string_op_info));
658  }
659  return string_ops;
660 }
661 
662 // Free functions follow
663 
664 std::unique_ptr<const StringOp> gen_string_op(const StringOpInfo& string_op_info) {
665  std::optional<std::string> var_string_optional_literal;
666  const auto op_kind = string_op_info.getOpKind();
667  const auto& return_ti = string_op_info.getReturnType();
668 
669  if (string_op_info.hasNullLiteralArg()) {
670  return std::make_unique<const NullOp>(var_string_optional_literal, op_kind);
671  }
672 
673  const auto num_non_variable_literals = string_op_info.numNonVariableLiterals();
674  if (string_op_info.hasVarStringLiteral()) {
675  CHECK_EQ(num_non_variable_literals + 1UL, string_op_info.numLiterals());
676  var_string_optional_literal = string_op_info.getStringLiteral(0);
677  }
678 
679  switch (op_kind) {
680  case SqlStringOpKind::LOWER: {
681  CHECK_EQ(num_non_variable_literals, 0UL);
682  return std::make_unique<const Lower>(var_string_optional_literal);
683  }
684  case SqlStringOpKind::UPPER: {
685  CHECK_EQ(num_non_variable_literals, 0UL);
686  return std::make_unique<const Upper>(var_string_optional_literal);
687  }
689  CHECK_EQ(num_non_variable_literals, 0UL);
690  return std::make_unique<const InitCap>(var_string_optional_literal);
691  }
693  CHECK_EQ(num_non_variable_literals, 0UL);
694  return std::make_unique<const Reverse>(var_string_optional_literal);
695  }
697  CHECK_EQ(num_non_variable_literals, 1UL);
698  const auto num_repeats_literal = string_op_info.getIntLiteral(1);
699  return std::make_unique<const Repeat>(var_string_optional_literal,
700  num_repeats_literal);
701  }
704  CHECK_EQ(num_non_variable_literals, 1UL);
705  const auto str_literal = string_op_info.getStringLiteral(1);
706  // Handle lhs literals by having RCONCAT operator set a flag
707  return std::make_unique<const Concat>(
708  var_string_optional_literal, str_literal, op_kind == SqlStringOpKind::RCONCAT);
709  }
711  case SqlStringOpKind::RPAD: {
712  CHECK_EQ(num_non_variable_literals, 2UL);
713  const auto padded_length_literal = string_op_info.getIntLiteral(1);
714  const auto padding_string_literal = string_op_info.getStringLiteral(2);
715  return std::make_unique<Pad>(var_string_optional_literal,
716  op_kind,
717  padded_length_literal,
718  padding_string_literal);
719  }
722  case SqlStringOpKind::RTRIM: {
723  CHECK_EQ(num_non_variable_literals, 1UL);
724  const auto trim_chars_literal = string_op_info.getStringLiteral(1);
725  return std::make_unique<Trim>(
726  var_string_optional_literal, op_kind, trim_chars_literal);
727  }
729  CHECK_GE(num_non_variable_literals, 1UL);
730  CHECK_LE(num_non_variable_literals, 2UL);
731  const auto start_pos_literal = string_op_info.getIntLiteral(1);
732  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(2);
733  if (has_length_literal) {
734  const auto length_literal = string_op_info.getIntLiteral(2);
735  return std::make_unique<const Substring>(
736  var_string_optional_literal, start_pos_literal, length_literal);
737  } else {
738  return std::make_unique<const Substring>(var_string_optional_literal,
739  start_pos_literal);
740  }
741  }
743  CHECK_GE(num_non_variable_literals, 2UL);
744  CHECK_LE(num_non_variable_literals, 3UL);
745  const auto replace_string_literal = string_op_info.getStringLiteral(1);
746  const auto start_pos_literal = string_op_info.getIntLiteral(2);
747  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(3);
748  if (has_length_literal) {
749  const auto length_literal = string_op_info.getIntLiteral(3);
750  return std::make_unique<const Overlay>(var_string_optional_literal,
751  replace_string_literal,
752  start_pos_literal,
753  length_literal);
754  } else {
755  return std::make_unique<const Overlay>(
756  var_string_optional_literal, replace_string_literal, start_pos_literal);
757  }
758  }
760  CHECK_GE(num_non_variable_literals, 2UL);
761  CHECK_LE(num_non_variable_literals, 2UL);
762  const auto pattern_string_literal = string_op_info.getStringLiteral(1);
763  const auto replacement_string_literal = string_op_info.getStringLiteral(2);
764  return std::make_unique<const Replace>(var_string_optional_literal,
765  pattern_string_literal,
766  replacement_string_literal);
767  }
769  CHECK_GE(num_non_variable_literals, 2UL);
770  CHECK_LE(num_non_variable_literals, 2UL);
771  const auto delimiter_literal = string_op_info.getStringLiteral(1);
772  const auto split_part_literal = string_op_info.getIntLiteral(2);
773  return std::make_unique<const SplitPart>(
774  var_string_optional_literal, delimiter_literal, split_part_literal);
775  }
777  CHECK_GE(num_non_variable_literals, 5UL);
778  CHECK_LE(num_non_variable_literals, 5UL);
779  const auto pattern_literal = string_op_info.getStringLiteral(1);
780  const auto replacement_literal = string_op_info.getStringLiteral(2);
781  const auto start_pos_literal = string_op_info.getIntLiteral(3);
782  const auto occurrence_literal = string_op_info.getIntLiteral(4);
783  const auto regex_params_literal = string_op_info.getStringLiteral(5);
784  return std::make_unique<const RegexpReplace>(var_string_optional_literal,
785  pattern_literal,
786  replacement_literal,
787  start_pos_literal,
788  occurrence_literal,
789  regex_params_literal);
790  }
792  CHECK_GE(num_non_variable_literals, 5UL);
793  CHECK_LE(num_non_variable_literals, 5UL);
794  const auto pattern_literal = string_op_info.getStringLiteral(1);
795  const auto start_pos_literal = string_op_info.getIntLiteral(2);
796  const auto occurrence_literal = string_op_info.getIntLiteral(3);
797  const auto regex_params_literal = string_op_info.getStringLiteral(4);
798  const auto sub_match_idx_literal = string_op_info.getIntLiteral(5);
799  return std::make_unique<const RegexpSubstr>(var_string_optional_literal,
800  pattern_literal,
801  start_pos_literal,
802  occurrence_literal,
803  regex_params_literal,
804  sub_match_idx_literal);
805  }
807  CHECK_EQ(num_non_variable_literals, 1UL);
808  const auto json_path_literal = string_op_info.getStringLiteral(1);
809  return std::make_unique<const JsonValue>(var_string_optional_literal,
810  json_path_literal);
811  }
813  CHECK_EQ(num_non_variable_literals, 0UL);
814  return std::make_unique<const Base64Encode>(var_string_optional_literal);
815  }
817  CHECK_EQ(num_non_variable_literals, 0UL);
818  return std::make_unique<const Base64Decode>(var_string_optional_literal);
819  }
821  CHECK_EQ(num_non_variable_literals, 0UL);
822  return std::make_unique<const TryStringCast>(return_ti,
823  var_string_optional_literal);
824  }
825  default: {
826  UNREACHABLE();
827  return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
828  }
829  }
830  // Make compiler happy
831  return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
832 }
833 
834 std::pair<std::string, bool /* is null */> apply_string_op_to_literals(
835  const StringOpInfo& string_op_info) {
836  CHECK(string_op_info.hasVarStringLiteral());
837  if (string_op_info.hasNullLiteralArg()) {
838  const std::string null_str{""};
839  return std::make_pair(null_str, true);
840  }
841  const auto string_op = gen_string_op(string_op_info);
842  return string_op->operator()().toPair();
843 }
844 
846  CHECK(string_op_info.hasVarStringLiteral());
847  const auto string_op = gen_string_op(string_op_info);
848  return string_op->numericEval();
849 }
850 
851 } // namespace StringOps_Namespace
#define CHECK_EQ(x, y)
Definition: Logger.h:230
const SQLTypeInfo & getReturnType() const
Definition: StringOpInfo.h:58
Datum apply_numeric_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:845
#define UNREACHABLE()
Definition: Logger.h:266
#define CHECK_GE(x, y)
Definition: Logger.h:235
SqlStringOpKind
Definition: sqldefs.h:84
bool intLiteralArgAtIdxExists(const size_t index) const
#define CHECK_GT(x, y)
Definition: Logger.h:234
std::string to_string(char const *&&v)
int64_t getIntLiteral(const size_t index) const
size_t numNonVariableLiterals() const
Definition: StringOpInfo.h:54
std::pair< std::string, bool > apply_string_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:834
Datum StringToDatum(const std::string_view s, SQLTypeInfo &ti)
Definition: Datum.cpp:337
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:296
Datum NullDatum(const SQLTypeInfo &ti)
Definition: Datum.cpp:286
#define CHECK_LE(x, y)
Definition: Logger.h:233
std::string getStringLiteral(const size_t index) const
#define CHECK(condition)
Definition: Logger.h:222
std::string decode_base64(const std::string &val, bool trim_nulls)
Definition: base64.h:27
static std::string encode_base64(const std::string &val)
Definition: base64.h:45
const SqlStringOpKind & getOpKind() const
Definition: StringOpInfo.h:42
Definition: Datum.h:44
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:664