OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringOps.h"
18 #include "Shared/base64.h"
19 
20 #include <rapidjson/document.h>
21 #include <boost/algorithm/string/predicate.hpp>
22 
23 namespace StringOps_Namespace {
24 
25 boost::regex StringOp::generateRegex(const std::string& op_name,
26  const std::string& regex_pattern,
27  const std::string& regex_params,
28  const bool supports_sub_matches) {
29  bool is_case_sensitive = false;
30  bool is_case_insensitive = false;
31 
32  for (const auto& c : regex_params) {
33  switch (c) {
34  case 'c':
35  is_case_sensitive = true;
36  break;
37  case 'i':
38  is_case_insensitive = true;
39  break;
40  case 'e': {
41  if (!supports_sub_matches) {
42  throw std::runtime_error(op_name +
43  " does not support 'e' (sub-matches) option.");
44  }
45  // We use e to set sub-expression group in a separate initializer
46  // but need to have this entry to not error on the default path
47  break;
48  }
49  default: {
50  if (supports_sub_matches) {
51  throw std::runtime_error("Unrecognized regex parameter for " + op_name +
52  ", expected either 'c' 'i', or 'e'.");
53  }
54  throw std::runtime_error("Unrecognized regex parameter for " + op_name +
55  ", expected either 'c' or 'i'.");
56  }
57  }
58  }
59  if (!is_case_sensitive && !is_case_insensitive) {
60  throw std::runtime_error(op_name +
61  " params must either specify case-sensitivity ('c') or "
62  "case-insensitivity ('i').");
63  }
64  if (is_case_sensitive && is_case_insensitive) {
65  throw std::runtime_error(op_name +
66  " params cannot specify both case-sensitivity ('c') and "
67  "case-insensitivity ('i').");
68  }
69  if (is_case_insensitive) {
70  return boost::regex(regex_pattern,
71  boost::regex_constants::extended |
72  boost::regex_constants::optimize |
73  boost::regex_constants::icase);
74  } else {
75  return boost::regex(
76  regex_pattern,
77  boost::regex_constants::extended | boost::regex_constants::optimize);
78  }
79 }
80 
81 NullableStrType TryStringCast::operator()(const std::string& str) const {
82  UNREACHABLE() << "Invalid string output for TryStringCast";
83  return NullableStrType();
84 }
85 
86 Datum TryStringCast::numericEval(const std::string_view str) const {
87  if (str.empty()) {
88  return NullDatum(return_ti_);
89  }
90  // Need to make copy for now b/c StringToDatum can mod SQLTypeInfo arg
91  SQLTypeInfo return_ti(return_ti_);
92  try {
93  return StringToDatum(str, return_ti);
94  } catch (std::runtime_error& e) {
95  return NullDatum(return_ti);
96  }
97 }
98 
99 NullableStrType Position::operator()(const std::string& str) const {
100  UNREACHABLE() << "Invalid string output for Position";
101  return {};
102 }
103 
104 Datum Position::numericEval(const std::string_view str) const {
105  if (str.empty()) {
106  return NullDatum(return_ti_);
107  } else {
108  const int64_t str_len = str.size();
109  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
110  Datum return_datum;
111  const auto search_index = str.find(search_str_, wrapped_start);
112  if (search_index == std::string::npos) {
113  return_datum.bigintval = 0;
114  } else {
115  return_datum.bigintval = static_cast<int64_t>(search_index) + 1;
116  }
117  return return_datum;
118  }
119 }
120 
121 NullableStrType Lower::operator()(const std::string& str) const {
122  std::string output_str(str);
124  output_str.begin(), output_str.end(), output_str.begin(), [](unsigned char c) {
125  return std::tolower(c);
126  });
127  return output_str;
128 }
129 
130 NullableStrType Upper::operator()(const std::string& str) const {
131  std::string output_str(str);
133  output_str.begin(), output_str.end(), output_str.begin(), [](unsigned char c) {
134  return std::toupper(c);
135  });
136  return output_str;
137 }
138 
139 NullableStrType InitCap::operator()(const std::string& str) const {
140  std::string output_str(str);
141  bool last_char_whitespace = true; // Beginning of string counts as whitespace
142  for (auto& c : output_str) {
143  if (isspace(c) || delimiter_bitmap_[reinterpret_cast<const uint8_t&>(c)]) {
144  last_char_whitespace = true;
145  continue;
146  }
147  if (last_char_whitespace) {
148  c = toupper(c);
149  last_char_whitespace = false;
150  } else {
151  c = tolower(c);
152  }
153  }
154  return output_str;
155 }
156 
157 NullableStrType Reverse::operator()(const std::string& str) const {
158  const std::string reversed_str = std::string(str.rbegin(), str.rend());
159  return reversed_str;
160 }
161 
162 NullableStrType Repeat::operator()(const std::string& str) const {
163  std::string repeated_str;
164  repeated_str.reserve(str.size() * n_);
165  for (size_t r = 0; r < n_; ++r) {
166  repeated_str += str;
167  }
168  return repeated_str;
169 }
170 
171 NullableStrType Concat::operator()(const std::string& str) const {
172  return reverse_order_ ? str_literal_ + str : str + str_literal_;
173 }
174 
175 NullableStrType Concat::operator()(const std::string& str1,
176  const std::string& str2) const {
177  return str1 + str2;
178 }
179 
180 NullableStrType Pad::operator()(const std::string& str) const {
181  return pad_mode_ == Pad::PadMode::LEFT ? lpad(str) : rpad(str);
182 }
183 
184 std::string Pad::lpad(const std::string& str) const {
185  const auto str_len = str.size();
186  const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
187  if (chars_to_fill == 0UL) {
188  return str.substr(0, padded_length_);
189  }
190  // If here we need to add characters from the padding_string_
191  // to fill the difference between str_len and padded_length_
192  if (padding_string_length_ == 1UL) {
193  return std::string(chars_to_fill, padding_char_) + str;
194  }
195 
196  std::string fitted_padding_str;
197  fitted_padding_str.reserve(chars_to_fill);
198  for (size_t i = 0; i < chars_to_fill; ++i) {
199  fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
200  }
201  return fitted_padding_str + str;
202 }
203 
204 std::string Pad::rpad(const std::string& str) const {
205  const auto str_len = str.size();
206  const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
207  if (chars_to_fill == 0UL) {
208  return str.substr(str_len - padded_length_, std::string::npos);
209  }
210  // If here we need to add characters from the padding_string_
211  // to fill the difference between str_len and padded_length_
212  if (padding_string_length_ == 1UL) {
213  return str + std::string(chars_to_fill, padding_char_);
214  }
215 
216  std::string fitted_padding_str;
217  fitted_padding_str.reserve(chars_to_fill);
218  for (size_t i = 0; i < chars_to_fill; ++i) {
219  fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
220  }
221  return str + fitted_padding_str;
222 }
223 
224 Pad::PadMode Pad::op_kind_to_pad_mode(const SqlStringOpKind op_kind) {
225  switch (op_kind) {
227  return PadMode::LEFT;
229  return PadMode::RIGHT;
230  default:
231  UNREACHABLE();
232  // Not reachable, but make compiler happy
233  return PadMode::LEFT;
234  };
235 }
236 
237 NullableStrType Trim::operator()(const std::string& str) const {
238  const auto str_len = str.size();
239  size_t trim_begin = 0;
240  if (trim_mode_ == TrimMode::LEFT || trim_mode_ == TrimMode::BOTH) {
241  while (trim_begin < str_len &&
242  trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_begin])]) {
243  ++trim_begin;
244  }
245  }
246  size_t trim_end = str_len - 1;
247  if (trim_mode_ == TrimMode::RIGHT || trim_mode_ == TrimMode::BOTH) {
248  while (trim_end > trim_begin &&
249  trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_end])]) {
250  --trim_end;
251  }
252  }
253  if (trim_begin == 0 && trim_end == str_len - 1) {
254  return str;
255  }
256  return str.substr(trim_begin, trim_end + 1 - trim_begin);
257 }
258 
259 Trim::TrimMode Trim::op_kind_to_trim_mode(const SqlStringOpKind op_kind) {
260  switch (op_kind) {
262  return Trim::TrimMode::BOTH;
264  return Trim::TrimMode::LEFT;
266  return Trim::TrimMode::RIGHT;
267  default:
268  UNREACHABLE();
269  // Not reachable, but make compiler happy
270  return Trim::TrimMode::BOTH;
271  };
272 }
273 
274 NullableStrType Substring::operator()(const std::string& str) const {
275  // If start_ is negative then we start abs(start_) characters from the end
276  // of the string
277  const int64_t str_len = str.size();
278  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
279  const size_t capped_start =
280  wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
281  return str.substr(capped_start, length_);
282 }
283 
284 NullableStrType Overlay::operator()(const std::string& base_str) const {
285  // If start_ is negative then we start abs(start_) characters from the end
286  // of the string
287  const int64_t str_len = base_str.size();
288  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
289  const size_t capped_start =
290  wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
291  std::string replaced_str = base_str.substr(0, capped_start);
292  replaced_str += insert_str_;
293  const size_t remainder_start =
294  std::min(wrapped_start + replacement_length_, size_t(str_len));
295  const size_t remainder_length = static_cast<size_t>(str_len) - remainder_start;
296  replaced_str += base_str.substr(remainder_start, remainder_length);
297  return replaced_str;
298 }
299 
300 NullableStrType Replace::operator()(const std::string& str) const {
301  std::string replaced_str(str);
302 
303  size_t search_start_index = 0;
304  while (true) {
305  search_start_index = replaced_str.find(pattern_str_, search_start_index);
306  if (search_start_index == std::string::npos) {
307  break;
308  }
309  replaced_str.replace(search_start_index, pattern_str_len_, replacement_str_);
310  search_start_index += replacement_str_len_;
311  }
312  return replaced_str;
313 }
314 
315 NullableStrType SplitPart::operator()(const std::string& str) const {
316  // If split_part_ is negative then it is taken as the number
317  // of split parts from the end of the string
318 
319  if (delimiter_ == "") {
320  return str;
321  }
322 
323  const size_t str_len = str.size();
324  size_t delimiter_pos = reverse_ ? str_len : 0UL;
325  size_t last_delimiter_pos;
326  size_t delimiter_idx = 0UL;
327 
328  do {
329  last_delimiter_pos = delimiter_pos;
330  delimiter_pos = reverse_ ? str.rfind(delimiter_, delimiter_pos - 1UL)
331  : str.find(delimiter_, delimiter_pos + delimiter_length_);
332  } while (delimiter_pos != std::string::npos && ++delimiter_idx < split_part_);
333 
334  if (delimiter_idx == 0UL && split_part_ == 1UL) {
335  // No delimiter was found, but the first match is requested, which here is
336  // the whole string
337  return str;
338  }
339 
340  if (delimiter_pos == std::string::npos &&
341  (delimiter_idx < split_part_ - 1UL || delimiter_idx < 1UL)) {
342  // split_part_ was out of range
343  return NullableStrType(); // null string
344  }
345 
346  if (reverse_) {
347  const size_t substr_start =
348  delimiter_pos == std::string::npos ? 0UL : delimiter_pos + delimiter_length_;
349  return str.substr(substr_start, last_delimiter_pos - substr_start);
350  } else {
351  const size_t substr_start =
352  split_part_ == 1UL ? 0UL : last_delimiter_pos + delimiter_length_;
353  return str.substr(substr_start, delimiter_pos - substr_start);
354  }
355 }
356 
357 NullableStrType RegexpReplace::operator()(const std::string& str) const {
358  const int64_t str_len = str.size();
359  const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
360  const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
361  if (occurrence_ == 0L) {
362  std::string result;
363  std::string::const_iterator replace_start(str.cbegin() + wrapped_start);
364  boost::regex_replace(std::back_inserter(result),
365  replace_start,
366  str.cend(),
367  regex_pattern_,
368  replacement_);
369  return str.substr(0UL, wrapped_start) + result;
370  } else {
371  const auto occurrence_match_pos = RegexpReplace::get_nth_regex_match(
372  str,
373  wrapped_start,
374  regex_pattern_,
375  occurrence_ > 0 ? occurrence_ - 1 : occurrence_);
376  if (occurrence_match_pos.first == std::string::npos) {
377  // No match found, return original string
378  return str;
379  }
380  std::string result;
381  std::string::const_iterator replace_start(str.cbegin() + occurrence_match_pos.first);
382  std::string::const_iterator replace_end(str.cbegin() + occurrence_match_pos.second);
383  std::string replaced_match;
384  boost::regex_replace(std::back_inserter(replaced_match),
385  replace_start,
386  replace_end,
387  regex_pattern_,
388  replacement_);
389  return str.substr(0UL, occurrence_match_pos.first) + replaced_match +
390  str.substr(occurrence_match_pos.second, std::string::npos);
391  }
392 }
393 
394 std::pair<size_t, size_t> RegexpReplace::get_nth_regex_match(
395  const std::string& str,
396  const size_t start_pos,
397  const boost::regex& regex_pattern,
398  const int64_t occurrence) {
399  std::vector<std::pair<size_t, size_t>> regex_match_positions;
400  std::string::const_iterator search_start(str.cbegin() + start_pos);
401  boost::smatch match;
402  int64_t match_idx = 0;
403  size_t string_pos = start_pos;
404  while (boost::regex_search(search_start, str.cend(), match, regex_pattern)) {
405  string_pos += match.position(size_t(0)) + match.length(0);
406  regex_match_positions.emplace_back(
407  std::make_pair(string_pos - match.length(0), string_pos));
408  if (match_idx++ == occurrence) {
409  return regex_match_positions.back();
410  }
411  search_start =
412  match.suffix().first; // Move to position after last char of matched string
413  // Position is relative to last match/initial iterator, so need to increment our
414  // string_pos accordingly
415  }
416  // occurrence only could have a valid match if negative here,
417  // but don't want to check in inner loop for performance reasons
418  const int64_t wrapped_match = occurrence >= 0 ? occurrence : match_idx + occurrence;
419  if (wrapped_match < 0 || wrapped_match >= match_idx) {
420  // Represents a non-match
421  return std::make_pair(std::string::npos, std::string::npos);
422  }
423  return regex_match_positions[wrapped_match];
424 }
425 
426 NullableStrType RegexpSubstr::operator()(const std::string& str) const {
427  const int64_t str_len = str.size();
428  const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
429  const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
430  int64_t match_idx = 0;
431  // Apears std::regex_search does not support string_view?
432  std::vector<std::string> regex_matches;
433  std::string::const_iterator search_start(str.cbegin() + wrapped_start);
434  boost::smatch match;
435  while (boost::regex_search(search_start, str.cend(), match, regex_pattern_)) {
436  if (match_idx++ == occurrence_) {
437  if (sub_match_info_.first) {
438  return RegexpSubstr::get_sub_match(match, sub_match_info_);
439  }
440  return NullableStrType(match[0]);
441  }
442  regex_matches.emplace_back(match[0]);
443  search_start =
444  match.suffix().first; // Move to position after last char of matched string
445  }
446  const int64_t wrapped_match = occurrence_ >= 0 ? occurrence_ : match_idx + occurrence_;
447  if (wrapped_match < 0 || wrapped_match >= match_idx) {
448  return NullableStrType();
449  }
450  if (sub_match_info_.first) {
451  return RegexpSubstr::get_sub_match(match, sub_match_info_);
452  }
453  return regex_matches[wrapped_match];
454 }
455 
456 std::string RegexpSubstr::get_sub_match(const boost::smatch& match,
457  const std::pair<bool, int64_t> sub_match_info) {
458  const int64_t num_sub_matches = match.size() - 1;
459  const int64_t wrapped_sub_match = sub_match_info.second >= 0
460  ? sub_match_info.second
461  : num_sub_matches + sub_match_info.second;
462  if (wrapped_sub_match < 0 || wrapped_sub_match >= num_sub_matches) {
463  return "";
464  }
465  return match[wrapped_sub_match + 1];
466 }
467 
468 std::pair<bool, int64_t> RegexpSubstr::set_sub_match_info(
469  const std::string& regex_pattern,
470  const int64_t sub_match_group_idx) {
471  if (regex_pattern.find("e", 0UL) == std::string::npos) {
472  return std::make_pair(false, 0UL);
473  }
474  return std::make_pair(
475  true, sub_match_group_idx > 0L ? sub_match_group_idx - 1 : sub_match_group_idx);
476 }
477 
478 // json_path must start with "lax $", "strict $" or "$" (case-insensitive).
479 JsonValue::JsonParseMode JsonValue::parse_json_parse_mode(std::string_view json_path) {
480  size_t const string_pos = json_path.find('$');
481  if (string_pos == 0) {
482  // Parsing mode was not explicitly specified, default to PARSE_MODE_LAX
483  return JsonValue::JsonParseMode::PARSE_MODE_LAX;
484  } else if (string_pos == std::string::npos) {
485  throw std::runtime_error("JSON search path must include a '$' literal.");
486  }
487  std::string_view const prefix = json_path.substr(0, string_pos);
488  if (boost::iequals(prefix, std::string_view("lax "))) {
489  return JsonValue::JsonParseMode::PARSE_MODE_LAX;
490  } else if (boost::iequals(prefix, std::string_view("strict "))) {
491  if constexpr (JsonValue::allow_strict_json_parsing) {
492  return JsonValue::JsonParseMode::PARSE_MODE_STRICT;
493  } else {
494  throw std::runtime_error("Strict parsing not currently supported for JSON_VALUE.");
495  }
496  } else {
497  throw std::runtime_error("Issue parsing JSON_VALUE Parse Mode.");
498  }
499 }
500 
501 std::vector<JsonValue::JsonKey> JsonValue::parse_json_path(const std::string& json_path) {
502  // Assume that parse_key_error_mode validated strict/lax mode
503  size_t string_pos = json_path.find("$");
504  if (string_pos == std::string::npos) {
505  throw std::runtime_error("JSON search path must begin with '$' literal.");
506  }
507  string_pos += 1; // Go to next character after $
508 
509  // Use tildas to enclose escaped regex string due to embedded ')"'
510  static const auto& key_regex = *new boost::regex(
511  R"~(^(\.(([[:alpha:]][[:alnum:]_-]*)|"([[:alpha:]][ [:alnum:]_-]*)"))|\[([[:digit:]]+)\])~",
512  boost::regex_constants::extended | boost::regex_constants::optimize);
513  static_assert(std::is_trivially_destructible_v<decltype(key_regex)>);
514 
515  std::string::const_iterator search_start(json_path.cbegin() + string_pos);
516  boost::smatch match;
517  std::vector<JsonKey> json_keys;
518  while (boost::regex_search(search_start, json_path.cend(), match, key_regex)) {
519  CHECK_EQ(match.size(), 6UL);
520  if (match.position(size_t(0)) != 0L) {
521  // Match wasn't found at beginning of string
522  throw std::runtime_error("JSON search path parsing error: '" + json_path + "'");
523  }
524  size_t matching_expr = 0;
525  if (match[3].matched) {
526  // simple object key
527  matching_expr = 3;
528  } else if (match[4].matched) {
529  // complex object key
530  matching_expr = 4;
531  } else if (match[5].matched) {
532  // array key
533  matching_expr = 5;
534  }
535  CHECK_GT(matching_expr, 0UL);
536  string_pos += match.length(0);
537 
538  const std::string key_match(match[matching_expr].first, match[matching_expr].second);
539  CHECK_GE(key_match.length(), 1UL);
540  if (isalpha(key_match[0])) {
541  // Object key
542  json_keys.emplace_back(JsonKey(key_match));
543  } else {
544  // Array key
545  json_keys.emplace_back(JsonKey(std::stoi(key_match)));
546  }
547  search_start =
548  match.suffix().first; // Move to position after last char of matched string
549  }
550  if (json_keys.empty()) {
551  throw std::runtime_error("No keys found in JSON search path.");
552  }
553  if (string_pos < json_path.size()) {
554  throw std::runtime_error("JSON path parsing error.");
555  }
556  return json_keys;
557 }
558 
559 NullableStrType JsonValue::operator()(const std::string& str) const {
560  rapidjson::Document document;
561  if (document.Parse(str.c_str()).HasParseError()) {
562  if constexpr (JsonValue::allow_strict_json_parsing) {
563  return handle_parse_error(str);
564  } else {
565  return NullableStrType();
566  }
567  }
568  rapidjson::Value& json_val = document;
569  for (const auto& json_key : json_keys_) {
570  switch (json_key.key_kind) {
571  case JsonKeyKind::JSON_OBJECT: {
572  if (!json_val.IsObject() || !json_val.HasMember(json_key.object_key)) {
573  if constexpr (JsonValue::allow_strict_json_parsing) {
574  return handle_key_error(str);
575  } else {
576  return NullableStrType();
577  }
578  }
579  json_val = json_val[json_key.object_key];
580  break;
581  }
582  case JsonKeyKind::JSON_ARRAY: {
583  if (!json_val.IsArray() || json_val.Size() <= json_key.array_key) {
584  if constexpr (JsonValue::allow_strict_json_parsing) {
585  return handle_key_error(str);
586  } else {
587  return NullableStrType();
588  }
589  }
590  json_val = json_val[json_key.array_key];
591  break;
592  }
593  }
594  }
595  // Now get value as string
596  if (json_val.IsString()) {
597  return NullableStrType(std::string(json_val.GetString()));
598  } else if (json_val.IsNumber()) {
599  if (json_val.IsDouble()) {
600  return NullableStrType(std::to_string(json_val.GetDouble()));
601  } else if (json_val.IsInt64()) {
602  return NullableStrType(std::to_string(json_val.GetInt64()));
603  } else if (json_val.IsUint64()) {
604  // Need to cover range of uint64 that can't fit int in64
605  return NullableStrType(std::to_string(json_val.GetUint64()));
606  } else {
607  // A bit defensive, as I'm fairly sure json does not
608  // support numeric types with widths > 64 bits, so may drop
609  if constexpr (JsonValue::allow_strict_json_parsing) {
610  return handle_key_error(str);
611  } else {
612  return NullableStrType();
613  }
614  }
615  } else if (json_val.IsBool()) {
616  return NullableStrType(std::string(json_val.IsTrue() ? "true" : "false"));
617  } else if (json_val.IsNull()) {
618  return NullableStrType();
619  } else {
620  // For any unhandled type - we may move this to a CHECK after gaining
621  // more confidence in prod
622  if constexpr (JsonValue::allow_strict_json_parsing) {
623  return handle_key_error(str);
624  } else {
625  return NullableStrType();
626  }
627  }
628 }
629 
630 NullableStrType Base64Encode::operator()(const std::string& str) const {
631  return shared::encode_base64(str);
632 }
633 
634 NullableStrType Base64Decode::operator()(const std::string& str) const {
635  return shared::decode_base64(str);
636 }
637 
638 std::string StringOps::operator()(const std::string& str) const {
639  NullableStrType modified_str(str);
640  if (modified_str.is_null) {
641  return ""; // How we currently represent dictionary-encoded nulls
642  }
643  for (const auto& string_op : string_ops_) {
644  modified_str = string_op->operator()(modified_str.str);
645  if (modified_str.is_null) {
646  return ""; // How we currently represent dictionary-encoded nulls
647  }
648  }
649  return modified_str.str;
650 }
651 
652 std::string StringOps::multi_input_eval(const std::string_view str1,
653  const std::string_view str2) const {
654  NullableStrType modified_str1(str1);
655  NullableStrType modified_str2(str2);
656  if (modified_str1.is_null || modified_str2.is_null) {
657  return ""; // How we currently represent dictionary-encoded nulls
658  }
659  for (const auto& string_op : string_ops_) {
660  modified_str1 = string_op->operator()(modified_str1.str, modified_str2.str);
661  if (modified_str1.is_null) {
662  return ""; // How we currently represent dictionary-encoded nulls
663  }
664  }
665  return modified_str1.str;
666 }
667 
668 std::string_view StringOps::operator()(const std::string_view sv,
669  std::string& sv_storage) const {
670  sv_storage = sv;
671  NullableStrType nullable_str(sv);
672  for (const auto& string_op : string_ops_) {
673  nullable_str = string_op->operator()(nullable_str.str);
674  if (nullable_str.is_null) {
675  return "";
676  }
677  }
678  sv_storage = nullable_str.str;
679  return sv_storage;
680 }
681 
682 Datum StringOps::numericEval(const std::string_view str) const {
683  const auto num_string_producing_ops = string_ops_.size() - 1;
684  if (num_string_producing_ops == 0UL) {
685  // Short circuit and avoid transformation to string if
686  // only have one string->numeric op
687  return string_ops_.back()->numericEval(str);
688  }
689  NullableStrType modified_str(str);
690  for (size_t string_op_idx = 0; string_op_idx < num_string_producing_ops;
691  ++string_op_idx) {
692  const auto& string_op = string_ops_[string_op_idx];
693  modified_str = string_op->operator()(modified_str.str);
694  if (modified_str.is_null) {
695  break;
696  }
697  }
698  return string_ops_.back()->numericEval(modified_str.str);
699 }
700 
701 std::vector<std::unique_ptr<const StringOp>> StringOps::genStringOpsFromOpInfos(
702  const std::vector<StringOpInfo>& string_op_infos) const {
703  // Should we handle pure literal expressions here as well
704  // even though they are currently rewritten to string literals?
705  std::vector<std::unique_ptr<const StringOp>> string_ops;
706  string_ops.reserve(string_op_infos.size());
707  for (const auto& string_op_info : string_op_infos) {
708  string_ops.emplace_back(gen_string_op(string_op_info));
709  }
710  return string_ops;
711 }
712 
713 // Free functions follow
714 
715 std::unique_ptr<const StringOp> gen_string_op(const StringOpInfo& string_op_info) {
716  std::optional<std::string> var_string_optional_literal;
717  const auto op_kind = string_op_info.getOpKind();
718  const auto& return_ti = string_op_info.getReturnType();
719 
720  if (string_op_info.hasNullLiteralArg()) {
721  return std::make_unique<const NullOp>(var_string_optional_literal, op_kind);
722  }
723 
724  const auto num_non_variable_literals = string_op_info.numNonVariableLiterals();
725  if (string_op_info.hasVarStringLiteral()) {
726  CHECK_EQ(num_non_variable_literals + 1UL, string_op_info.numLiterals());
727  var_string_optional_literal = string_op_info.getStringLiteral(0);
728  }
729 
730  switch (op_kind) {
731  case SqlStringOpKind::LOWER: {
732  CHECK_EQ(num_non_variable_literals, 0UL);
733  return std::make_unique<const Lower>(var_string_optional_literal);
734  }
735  case SqlStringOpKind::UPPER: {
736  CHECK_EQ(num_non_variable_literals, 0UL);
737  return std::make_unique<const Upper>(var_string_optional_literal);
738  }
740  CHECK_EQ(num_non_variable_literals, 0UL);
741  return std::make_unique<const InitCap>(var_string_optional_literal);
742  }
744  CHECK_EQ(num_non_variable_literals, 0UL);
745  return std::make_unique<const Reverse>(var_string_optional_literal);
746  }
748  CHECK_EQ(num_non_variable_literals, 1UL);
749  const auto num_repeats_literal = string_op_info.getIntLiteral(1);
750  return std::make_unique<const Repeat>(var_string_optional_literal,
751  num_repeats_literal);
752  }
755  CHECK_GE(num_non_variable_literals, 0UL);
756  CHECK_LE(num_non_variable_literals, 1UL);
757  if (num_non_variable_literals == 1UL) {
758  const auto str_literal = string_op_info.getStringLiteral(1);
759  // Handle lhs literals by having RCONCAT operator set a flag
760  return std::make_unique<const Concat>(var_string_optional_literal,
761  str_literal,
762  op_kind == SqlStringOpKind::RCONCAT);
763  } else {
764  return std::make_unique<const Concat>(var_string_optional_literal);
765  }
766  }
768  case SqlStringOpKind::RPAD: {
769  CHECK_EQ(num_non_variable_literals, 2UL);
770  const auto padded_length_literal = string_op_info.getIntLiteral(1);
771  const auto padding_string_literal = string_op_info.getStringLiteral(2);
772  return std::make_unique<Pad>(var_string_optional_literal,
773  op_kind,
774  padded_length_literal,
775  padding_string_literal);
776  }
779  case SqlStringOpKind::RTRIM: {
780  CHECK_EQ(num_non_variable_literals, 1UL);
781  const auto trim_chars_literal = string_op_info.getStringLiteral(1);
782  return std::make_unique<Trim>(
783  var_string_optional_literal, op_kind, trim_chars_literal);
784  }
786  CHECK_GE(num_non_variable_literals, 1UL);
787  CHECK_LE(num_non_variable_literals, 2UL);
788  const auto start_pos_literal = string_op_info.getIntLiteral(1);
789  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(2);
790  if (has_length_literal) {
791  const auto length_literal = string_op_info.getIntLiteral(2);
792  return std::make_unique<const Substring>(
793  var_string_optional_literal, start_pos_literal, length_literal);
794  } else {
795  return std::make_unique<const Substring>(var_string_optional_literal,
796  start_pos_literal);
797  }
798  }
800  CHECK_GE(num_non_variable_literals, 2UL);
801  CHECK_LE(num_non_variable_literals, 3UL);
802  const auto replace_string_literal = string_op_info.getStringLiteral(1);
803  const auto start_pos_literal = string_op_info.getIntLiteral(2);
804  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(3);
805  if (has_length_literal) {
806  const auto length_literal = string_op_info.getIntLiteral(3);
807  return std::make_unique<const Overlay>(var_string_optional_literal,
808  replace_string_literal,
809  start_pos_literal,
810  length_literal);
811  } else {
812  return std::make_unique<const Overlay>(
813  var_string_optional_literal, replace_string_literal, start_pos_literal);
814  }
815  }
817  CHECK_GE(num_non_variable_literals, 2UL);
818  CHECK_LE(num_non_variable_literals, 2UL);
819  const auto pattern_string_literal = string_op_info.getStringLiteral(1);
820  const auto replacement_string_literal = string_op_info.getStringLiteral(2);
821  return std::make_unique<const Replace>(var_string_optional_literal,
822  pattern_string_literal,
823  replacement_string_literal);
824  }
826  CHECK_GE(num_non_variable_literals, 2UL);
827  CHECK_LE(num_non_variable_literals, 2UL);
828  const auto delimiter_literal = string_op_info.getStringLiteral(1);
829  const auto split_part_literal = string_op_info.getIntLiteral(2);
830  return std::make_unique<const SplitPart>(
831  var_string_optional_literal, delimiter_literal, split_part_literal);
832  }
834  CHECK_GE(num_non_variable_literals, 5UL);
835  CHECK_LE(num_non_variable_literals, 5UL);
836  const auto pattern_literal = string_op_info.getStringLiteral(1);
837  const auto replacement_literal = string_op_info.getStringLiteral(2);
838  const auto start_pos_literal = string_op_info.getIntLiteral(3);
839  const auto occurrence_literal = string_op_info.getIntLiteral(4);
840  const auto regex_params_literal = string_op_info.getStringLiteral(5);
841  return std::make_unique<const RegexpReplace>(var_string_optional_literal,
842  pattern_literal,
843  replacement_literal,
844  start_pos_literal,
845  occurrence_literal,
846  regex_params_literal);
847  }
849  CHECK_GE(num_non_variable_literals, 5UL);
850  CHECK_LE(num_non_variable_literals, 5UL);
851  const auto pattern_literal = string_op_info.getStringLiteral(1);
852  const auto start_pos_literal = string_op_info.getIntLiteral(2);
853  const auto occurrence_literal = string_op_info.getIntLiteral(3);
854  const auto regex_params_literal = string_op_info.getStringLiteral(4);
855  const auto sub_match_idx_literal = string_op_info.getIntLiteral(5);
856  return std::make_unique<const RegexpSubstr>(var_string_optional_literal,
857  pattern_literal,
858  start_pos_literal,
859  occurrence_literal,
860  regex_params_literal,
861  sub_match_idx_literal);
862  }
864  CHECK_EQ(num_non_variable_literals, 1UL);
865  const auto json_path_literal = string_op_info.getStringLiteral(1);
866  return std::make_unique<const JsonValue>(var_string_optional_literal,
867  json_path_literal);
868  }
870  CHECK_EQ(num_non_variable_literals, 0UL);
871  return std::make_unique<const Base64Encode>(var_string_optional_literal);
872  }
874  CHECK_EQ(num_non_variable_literals, 0UL);
875  return std::make_unique<const Base64Decode>(var_string_optional_literal);
876  }
878  CHECK_EQ(num_non_variable_literals, 0UL);
879  return std::make_unique<const TryStringCast>(return_ti,
880  var_string_optional_literal);
881  }
883  CHECK_GE(num_non_variable_literals, 1UL);
884  CHECK_LE(num_non_variable_literals, 2UL);
885  const auto search_literal = string_op_info.getStringLiteral(1);
886  const bool has_start_pos_literal = string_op_info.intLiteralArgAtIdxExists(2);
887  if (has_start_pos_literal) {
888  const auto start_pos_literal = string_op_info.getIntLiteral(2);
889  return std::make_unique<const Position>(
890  var_string_optional_literal, search_literal, start_pos_literal);
891  } else {
892  return std::make_unique<const Position>(var_string_optional_literal,
893  search_literal);
894  }
895  }
896  default: {
897  UNREACHABLE();
898  return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
899  }
900  }
901  // Make compiler happy
902  return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
903 }
904 
905 std::pair<std::string, bool /* is null */> apply_string_op_to_literals(
906  const StringOpInfo& string_op_info) {
907  CHECK(string_op_info.hasVarStringLiteral());
908  if (string_op_info.hasNullLiteralArg()) {
909  const std::string null_str{""};
910  return std::make_pair(null_str, true);
911  }
912  const auto string_op = gen_string_op(string_op_info);
913  return string_op->operator()().toPair();
914 }
915 
917  CHECK(string_op_info.hasVarStringLiteral());
918  const auto string_op = gen_string_op(string_op_info);
919  return string_op->numericEval();
920 }
921 
922 } // namespace StringOps_Namespace
#define CHECK_EQ(x, y)
Definition: Logger.h:301
const SQLTypeInfo & getReturnType() const
Definition: StringOpInfo.h:58
Datum apply_numeric_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:916
#define UNREACHABLE()
Definition: Logger.h:337
#define CHECK_GE(x, y)
Definition: Logger.h:306
SqlStringOpKind
Definition: sqldefs.h:89
bool intLiteralArgAtIdxExists(const size_t index) const
#define CHECK_GT(x, y)
Definition: Logger.h:305
std::string to_string(char const *&&v)
int64_t getIntLiteral(const size_t index) const
size_t numNonVariableLiterals() const
Definition: StringOpInfo.h:54
int64_t bigintval
Definition: Datum.h:72
std::pair< std::string, bool > apply_string_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:905
Datum StringToDatum(const std::string_view s, SQLTypeInfo &ti)
Definition: Datum.cpp:337
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:320
Datum NullDatum(const SQLTypeInfo &ti)
Definition: Datum.cpp:286
#define CHECK_LE(x, y)
Definition: Logger.h:304
std::string getStringLiteral(const size_t index) const
#define CHECK(condition)
Definition: Logger.h:291
std::string decode_base64(const std::string &val, bool trim_nulls)
Definition: base64.h:27
static std::string encode_base64(const std::string &val)
Definition: base64.h:45
const SqlStringOpKind & getOpKind() const
Definition: StringOpInfo.h:42
Definition: Datum.h:67
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:715