18 namespace StringOps_Namespace {
// Builds the boost::regex used by a regex-based string op from a pattern and
// a string of single-character option flags.
//
// op_name is only used as a prefix in the error messages below.
// regex_params is scanned one character at a time; the error messages name
// c (case-sensitive), i (case-insensitive) and e (sub-matches) as the
// recognised flags. supports_sub_matches controls whether e is accepted.
//
// Throws std::runtime_error when e is given to an op that does not support
// it, when an unrecognised flag is seen, and when the flags specify neither
// or both of c and i.
//
// NOTE(review): the per-character dispatch (presumably a switch on c) is not
// visible in this listing - confirm against the full file.
20 boost::regex StringOp::generateRegex(
const std::string& op_name,
21 const std::string& regex_pattern,
22 const std::string& regex_params,
23 const bool supports_sub_matches) {
24 bool is_case_sensitive =
false;
25 bool is_case_insensitive =
false;
// Scan every flag character and record which case mode was requested.
27 for (
const auto& c : regex_params) {
30 is_case_sensitive =
true;
33 is_case_insensitive =
true;
// The sub-match flag is rejected for ops that cannot honour it.
36 if (!supports_sub_matches) {
37 throw std::runtime_error(op_name +
38 " does not support 'e' (sub-matches) option.");
// Unrecognised flag: the message lists only the flags this op accepts.
45 if (supports_sub_matches) {
46 throw std::runtime_error(
"Unrecognized regex parameter for " + op_name +
47 ", expected either 'c' 'i', or 'e'.");
49 throw std::runtime_error(
"Unrecognized regex parameter for " + op_name +
50 ", expected either 'c' or 'i'.");
// Exactly one of the two case modes must have been requested.
54 if (!is_case_sensitive && !is_case_insensitive) {
55 throw std::runtime_error(op_name +
56 " params must either specify case-sensitivity ('c') or "
57 "case-insensitivity ('i').");
59 if (is_case_sensitive && is_case_insensitive) {
60 throw std::runtime_error(op_name +
61 " params cannot specify both case-sensitivity ('c') and "
62 "case-insensitivity ('i').");
// Case-insensitive: extended syntax, optimized, plus the icase flag.
64 if (is_case_insensitive) {
65 return boost::regex(regex_pattern,
66 boost::regex_constants::extended |
67 boost::regex_constants::optimize |
68 boost::regex_constants::icase);
// NOTE(review): presumably the tail of the case-sensitive return (same flags
// without icase); the start of this statement is not visible here.
72 boost::regex_constants::extended | boost::regex_constants::optimize);
// Lower-cases a copy of the input string.
// The visible arguments name output_str as both the source range and the
// destination, with a lambda that maps each element (taken as unsigned char)
// through std::tolower.
// NOTE(review): the call consuming these arguments (presumably
// std::transform) and the return statement are not visible in this listing.
76 NullableStrType Lower::operator()(
const std::string& str)
const {
77 std::string output_str(str);
79 output_str.begin(), output_str.end(), output_str.begin(), [](
unsigned char c) {
80 return std::tolower(c);
// Upper-cases a copy of the input string.
// The visible arguments name output_str as both the source range and the
// destination, with a lambda that maps each element (taken as unsigned char)
// through std::toupper.
// NOTE(review): the call consuming these arguments (presumably
// std::transform) and the return statement are not visible in this listing.
85 NullableStrType Upper::operator()(
const std::string& str)
const {
86 std::string output_str(str);
88 output_str.begin(), output_str.end(), output_str.begin(), [](
unsigned char c) {
89 return std::toupper(c);
// Walks a mutable copy of str while tracking whether the previous character
// was a word separator.
// A character counts as a separator when isspace reports it as whitespace or
// when its byte value is flagged in delimiter_bitmap_. The flag starts out
// true, so the first non-separator character is treated as a word start.
// NOTE(review): the statements that change the case of c, and the return,
// are not visible in this listing - confirm against the full file.
94 NullableStrType InitCap::operator()(
const std::string& str)
const {
95 std::string output_str(str);
96 bool last_char_whitespace =
true;
97 for (
auto& c : output_str) {
// NOTE(review): c is a plain char here; passing a negative value to isspace
// is undefined behaviour. Consider casting to unsigned char first, as the
// Lower and Upper lambdas in this file do via their parameter type.
98 if (isspace(c) || delimiter_bitmap_[reinterpret_cast<const uint8_t&>(c)]) {
99 last_char_whitespace =
true;
// First character after a separator: handled here, then the flag is cleared.
102 if (last_char_whitespace) {
104 last_char_whitespace =
false;
// Builds a byte-wise reversed copy of str from its reverse iterators.
// NOTE(review): the return statement is not visible in this listing.
112 NullableStrType Reverse::operator()(
const std::string& str)
const {
113 const std::string reversed_str = std::string(str.rbegin(), str.rend());
// Produces a string sized for n_ copies of str.
// Capacity for str.size() * n_ characters is reserved up front and the loop
// runs n_ times.
// NOTE(review): the loop body (presumably an append of str) and the return
// are not visible in this listing.
117 NullableStrType Repeat::operator()(
const std::string& str)
const {
118 std::string repeated_str;
119 repeated_str.reserve(str.size() * n_);
120 for (
size_t r = 0; r < n_; ++r) {
126 NullableStrType Concat::operator()(
const std::string& str)
const {
127 return reverse_order_ ? str_literal_ + str : str + str_literal_;
130 NullableStrType Pad::operator()(
const std::string& str)
const {
131 return pad_mode_ == Pad::PadMode::LEFT ? lpad(str) : rpad(str);
134 std::string Pad::lpad(
const std::string& str)
const {
135 const auto str_len = str.size();
136 const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
137 if (chars_to_fill == 0UL) {
138 return str.substr(0, padded_length_);
142 if (padding_string_length_ == 1UL) {
143 return std::string(chars_to_fill, padding_char_) + str;
146 std::string fitted_padding_str;
147 fitted_padding_str.reserve(chars_to_fill);
148 for (
size_t i = 0; i < chars_to_fill; ++i) {
149 fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
151 return fitted_padding_str + str;
154 std::string Pad::rpad(
const std::string& str)
const {
155 const auto str_len = str.size();
156 const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
157 if (chars_to_fill == 0UL) {
158 return str.substr(str_len - padded_length_, std::string::npos);
162 if (padding_string_length_ == 1UL) {
163 return str + std::string(chars_to_fill, padding_char_);
166 std::string fitted_padding_str;
167 fitted_padding_str.reserve(chars_to_fill);
168 for (
size_t i = 0; i < chars_to_fill; ++i) {
169 fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
171 return str + fitted_padding_str;
177 return PadMode::LEFT;
179 return PadMode::RIGHT;
183 return PadMode::LEFT;
187 NullableStrType Trim::operator()(
const std::string& str)
const {
188 const auto str_len = str.size();
189 size_t trim_begin = 0;
190 if (trim_mode_ == TrimMode::LEFT || trim_mode_ == TrimMode::BOTH) {
191 while (trim_begin < str_len &&
192 trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_begin])]) {
196 size_t trim_end = str_len - 1;
197 if (trim_mode_ == TrimMode::RIGHT || trim_mode_ == TrimMode::BOTH) {
198 while (trim_end > trim_begin &&
199 trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_end])]) {
203 if (trim_begin == 0 && trim_end == str_len - 1) {
206 return str.substr(trim_begin, trim_end + 1 - trim_begin);
// Maps a SqlStringOpKind to the TrimMode used by Trim.
// The visible returns yield BOTH, LEFT and RIGHT, followed by a final BOTH.
// NOTE(review): the case labels selecting each return are not visible in this
// listing; the last return is presumably the fallback - confirm.
209 Trim::TrimMode Trim::op_kind_to_trim_mode(
const SqlStringOpKind op_kind) {
212 return Trim::TrimMode::BOTH;
214 return Trim::TrimMode::LEFT;
216 return Trim::TrimMode::RIGHT;
220 return Trim::TrimMode::BOTH;
224 NullableStrType Substring::operator()(
const std::string& str)
const {
227 const int64_t str_len = str.size();
228 const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
229 const size_t capped_start =
230 wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
231 return str.substr(capped_start, length_);
// Builds a new string from base_str in three pieces: the prefix before the
// start position, insert_str_, and the remainder of base_str after the
// replaced span.
// A negative start_ is measured back from the end of base_str, and the prefix
// length is clamped into [0, str_len].
// NOTE(review): the return statement is not visible in this listing.
234 NullableStrType Overlay::operator()(
const std::string& base_str)
const {
237 const int64_t str_len = base_str.size();
238 const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
239 const size_t capped_start =
240 wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
241 std::string replaced_str = base_str.substr(0, capped_start);
242 replaced_str += insert_str_;
// NOTE(review): this uses wrapped_start rather than capped_start, so a start
// that is still negative after wrapping feeds a negative value into the sum.
// Depending on the type of replacement_length_ (not visible here) that may
// wrap to a large value and drop the whole remainder - confirm intent.
243 const size_t remainder_start =
244 std::min(wrapped_start + replacement_length_,
size_t(str_len));
245 const size_t remainder_length =
static_cast<size_t>(str_len) - remainder_start;
246 replaced_str += base_str.substr(remainder_start, remainder_length);
// Replaces occurrences of pattern_str_ with replacement_str_ in a copy of
// str.
// Each step searches from search_start_index, replaces pattern_str_len_
// characters at the hit, then advances past the inserted replacement so the
// replacement text itself is not searched again.
// NOTE(review): the enclosing loop, the exit taken when find returns npos,
// and the return are not visible in this listing. If pattern_str_ can be
// empty and nothing earlier guards against it, find would keep succeeding -
// confirm against the full file.
250 NullableStrType Replace::operator()(
const std::string& str)
const {
251 std::string replaced_str(str);
253 size_t search_start_index = 0;
255 search_start_index = replaced_str.find(pattern_str_, search_start_index);
256 if (search_start_index == std::string::npos) {
259 replaced_str.replace(search_start_index, pattern_str_len_, replacement_str_);
260 search_start_index += replacement_str_len_;
// Extracts one delimiter-separated part of str.
// Delimiters are located with find when scanning forward, or with rfind when
// reverse_ is set (starting from the end of the string). The scan stops once
// no further delimiter is found or split_part_ delimiters have been counted.
// NOTE(review): several branch bodies and the lines choosing between the two
// final substr forms are not visible in this listing.
265 NullableStrType SplitPart::operator()(
const std::string& str)
const {
// An empty delimiter is special-cased (body not visible here).
269 if (delimiter_ ==
"") {
273 const size_t str_len = str.size();
274 size_t delimiter_pos = reverse_ ? str_len : 0UL;
275 size_t last_delimiter_pos;
276 size_t delimiter_idx = 0UL;
// Remember the previous hit, then look for the next delimiter in the
// configured direction.
279 last_delimiter_pos = delimiter_pos;
280 delimiter_pos = reverse_ ? str.rfind(delimiter_, delimiter_pos - 1UL)
281 : str.find(delimiter_, delimiter_pos + delimiter_length_);
282 }
while (delimiter_pos != std::string::npos && ++delimiter_idx < split_part_);
// No delimiter was found at all and the first part was requested.
284 if (delimiter_idx == 0UL && split_part_ == 1UL) {
// The scan ran out of delimiters before reaching the requested part: a
// default-constructed NullableStrType is returned (presumably the null
// value - confirm).
290 if (delimiter_pos == std::string::npos &&
291 (delimiter_idx < split_part_ - 1UL || delimiter_idx < 1UL)) {
293 return NullableStrType();
// This form takes the text between the newest hit (or the string start) and
// the previous hit.
297 const size_t substr_start =
298 delimiter_pos == std::string::npos ? 0UL : delimiter_pos + delimiter_length_;
299 return str.substr(substr_start, last_delimiter_pos - substr_start);
// This form takes the text between the previous hit (or the string start for
// part 1) and the newest hit.
301 const size_t substr_start =
302 split_part_ == 1UL ? 0UL : last_delimiter_pos + delimiter_length_;
303 return str.substr(substr_start, delimiter_pos - substr_start);
// Performs a regex replacement on str starting at start_pos_.
// A negative start_pos_ is measured back from the end of the string, and the
// result is clamped into [0, str.size()].
// NOTE(review): the arguments of both boost::regex_replace calls, and the
// branch taken when no occurrence is found, are not visible in this listing.
307 NullableStrType RegexpReplace::operator()(
const std::string& str)
const {
308 const int64_t str_len = str.size();
309 const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
310 const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
// occurrence_ == 0: run the replacement from wrapped_start onward and
// re-attach the untouched prefix in front of the result.
311 if (occurrence_ == 0L) {
313 std::string::const_iterator replace_start(str.cbegin() + wrapped_start);
314 boost::regex_replace(std::back_inserter(result),
319 return str.substr(0UL, wrapped_start) +
result;
// Otherwise locate a single occurrence. Positive occurrences are converted to
// zero-based; non-positive values are passed through unchanged.
321 const auto occurrence_match_pos = RegexpReplace::get_nth_regex_match(
325 occurrence_ > 0 ? occurrence_ - 1 : occurrence_);
326 if (occurrence_match_pos.first == std::string::npos) {
// Replace only inside the located match, then stitch prefix + replaced match
// + suffix back together.
331 std::string::const_iterator replace_start(str.cbegin() + occurrence_match_pos.first);
332 std::string::const_iterator replace_end(str.cbegin() + occurrence_match_pos.second);
333 std::string replaced_match;
334 boost::regex_replace(std::back_inserter(replaced_match),
339 return str.substr(0UL, occurrence_match_pos.first) + replaced_match +
340 str.substr(occurrence_match_pos.second, std::string::npos);
// Finds the occurrence-th match of regex_pattern in str, searching from
// start_pos, and returns its (begin, end) offsets within str.
// A non-negative occurrence is a zero-based index and returns as soon as that
// match is reached. A negative occurrence is resolved after all matches have
// been collected, by adding it to the match count.
// Returns a pair of std::string::npos when the requested match does not
// exist.
344 std::pair<size_t, size_t> RegexpReplace::get_nth_regex_match(
345 const std::string& str,
346 const size_t start_pos,
347 const boost::regex& regex_pattern,
348 const int64_t occurrence) {
349 std::vector<std::pair<size_t, size_t>> regex_match_positions;
350 std::string::const_iterator search_start(str.cbegin() + start_pos);
352 int64_t match_idx = 0;
353 size_t string_pos = start_pos;
354 while (boost::regex_search(search_start, str.cend(), match, regex_pattern)) {
// match.position is relative to search_start, so string_pos accumulates to
// the absolute end offset of the current match.
355 string_pos += match.position(
size_t(0)) + match.length(0);
356 regex_match_positions.emplace_back(
357 std::make_pair(string_pos - match.length(0), string_pos));
358 if (match_idx++ == occurrence) {
359 return regex_match_positions.back();
// NOTE(review): presumably the right-hand side of an assignment that advances
// search_start past the current match; the left-hand side is not visible.
362 match.suffix().first;
// Resolve a negative occurrence against the total number of matches found.
368 const int64_t wrapped_match = occurrence >= 0 ? occurrence : match_idx + occurrence;
369 if (wrapped_match < 0 || wrapped_match >= match_idx) {
371 return std::make_pair(std::string::npos, std::string::npos);
373 return regex_match_positions[wrapped_match];
// Returns the text of the occurrence_-th match of regex_pattern_ in str,
// searching from start_pos_, or one of its sub-matches when
// sub_match_info_.first is set.
// A negative start_pos_ is measured back from the end of the string and the
// result is clamped into [0, str.size()]. A negative occurrence_ is resolved
// after all matches have been collected. When the requested occurrence does
// not exist a default-constructed NullableStrType is returned (presumably
// the null value - confirm).
376 NullableStrType RegexpSubstr::operator()(
const std::string& str)
const {
377 const int64_t str_len = str.size();
378 const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
379 const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
380 int64_t match_idx = 0;
382 std::vector<std::string> regex_matches;
383 std::string::const_iterator search_start(str.cbegin() + wrapped_start);
385 while (boost::regex_search(search_start, str.cend(), match, regex_pattern_)) {
// Fast path: the requested zero-based occurrence has just been reached.
386 if (match_idx++ == occurrence_) {
387 if (sub_match_info_.first) {
388 return RegexpSubstr::get_sub_match(match, sub_match_info_);
390 return NullableStrType(match[0]);
// Keep every full match so a negative occurrence_ can be resolved later.
392 regex_matches.emplace_back(match[0]);
// NOTE(review): presumably the right-hand side of an assignment that advances
// search_start past the current match; the left-hand side is not visible.
394 match.suffix().first;
396 const int64_t wrapped_match = occurrence_ >= 0 ? occurrence_ : match_idx + occurrence_;
397 if (wrapped_match < 0 || wrapped_match >= match_idx) {
398 return NullableStrType();
// NOTE(review): this point is only reached after regex_search has returned
// false, so match holds the result of the final failed search rather than
// the match selected by wrapped_match. The sub-match returned here may not
// belong to the requested occurrence - confirm.
400 if (sub_match_info_.first) {
401 return RegexpSubstr::get_sub_match(match, sub_match_info_);
403 return regex_matches[wrapped_match];
// Returns the capture group selected by sub_match_info.second from match.
// The index is zero-based over the capture groups (match[0], the full match,
// is excluded from the count); a negative index counts back from the last
// group.
// NOTE(review): the statement executed when the index is out of range is not
// visible in this listing.
406 std::string RegexpSubstr::get_sub_match(
const boost::smatch& match,
407 const std::pair<bool, int64_t> sub_match_info) {
408 const int64_t num_sub_matches = match.size() - 1;
409 const int64_t wrapped_sub_match = sub_match_info.second >= 0
410 ? sub_match_info.second
411 : num_sub_matches + sub_match_info.second;
412 if (wrapped_sub_match < 0 || wrapped_sub_match >= num_sub_matches) {
// Shift by one because index 0 of the match object is the whole match.
415 return match[wrapped_sub_match + 1];
418 std::pair<bool, int64_t> RegexpSubstr::set_sub_match_info(
419 const std::string& regex_pattern,
420 const int64_t sub_match_group_idx) {
421 if (regex_pattern.find(
"e", 0UL) == std::string::npos) {
422 return std::make_pair(
false, 0UL);
424 return std::make_pair(
425 true, sub_match_group_idx > 0L ? sub_match_group_idx - 1 : sub_match_group_idx);
// Applies every op in string_ops_ to str in order, feeding each result into
// the next op, and returns the final string.
// NOTE(review): the statement executed when an intermediate result is null is
// not visible in this listing.
428 std::string StringOps::operator()(
const std::string& str)
const {
429 NullableStrType modified_str(str);
430 for (
const auto& string_op : string_ops_) {
431 modified_str = string_op->operator()(modified_str.str);
432 if (modified_str.is_null) {
436 return modified_str.str;
// string_view overload: applies every op in string_ops_ to sv in order and
// stores the final string in the caller-provided sv_storage.
// NOTE(review): the null-result branch and the return statement are not
// visible in this listing; presumably the returned view refers to
// sv_storage, which would then need to outlive it - confirm.
439 std::string_view StringOps::operator()(
const std::string_view& sv,
440 std::string& sv_storage)
const {
442 NullableStrType nullable_str(sv);
443 for (
const auto& string_op : string_ops_) {
444 nullable_str = string_op->operator()(nullable_str.str);
445 if (nullable_str.is_null) {
449 sv_storage = nullable_str.str;
// Builds the vector of StringOp objects for the given StringOpInfo entries.
// The output vector is pre-sized to the number of infos before the loop.
// NOTE(review): the loop body and the return are not visible in this listing.
453 std::vector<std::unique_ptr<const StringOp>> StringOps::genStringOpsFromOpInfos(
454 const std::vector<StringOpInfo>& string_op_infos)
const {
457 std::vector<std::unique_ptr<const StringOp>> string_ops;
458 string_ops.reserve(string_op_infos.size());
459 for (
const auto& string_op_info : string_op_infos) {
468 std::optional<std::string> var_string_optional_literal;
469 const auto op_kind = string_op_info.
getOpKind();
472 return std::make_unique<const NullOp>(var_string_optional_literal, op_kind);
483 CHECK_EQ(num_non_variable_literals, 0UL);
484 return std::make_unique<const Lower>(var_string_optional_literal);
487 CHECK_EQ(num_non_variable_literals, 0UL);
488 return std::make_unique<const Upper>(var_string_optional_literal);
491 CHECK_EQ(num_non_variable_literals, 0UL);
492 return std::make_unique<const InitCap>(var_string_optional_literal);
495 CHECK_EQ(num_non_variable_literals, 0UL);
496 return std::make_unique<const Reverse>(var_string_optional_literal);
499 CHECK_EQ(num_non_variable_literals, 1UL);
500 const auto num_repeats_literal = string_op_info.
getIntLiteral(1);
501 return std::make_unique<const Repeat>(var_string_optional_literal,
502 num_repeats_literal);
506 CHECK_EQ(num_non_variable_literals, 1UL);
509 return std::make_unique<const Concat>(
514 CHECK_EQ(num_non_variable_literals, 2UL);
515 const auto padded_length_literal = string_op_info.
getIntLiteral(1);
517 return std::make_unique<Pad>(var_string_optional_literal,
519 padded_length_literal,
520 padding_string_literal);
525 CHECK_EQ(num_non_variable_literals, 1UL);
527 return std::make_unique<Trim>(
528 var_string_optional_literal, op_kind, trim_chars_literal);
531 CHECK_GE(num_non_variable_literals, 1UL);
532 CHECK_LE(num_non_variable_literals, 2UL);
533 const auto start_pos_literal = string_op_info.
getIntLiteral(1);
535 if (has_length_literal) {
537 return std::make_unique<const Substring>(
538 var_string_optional_literal, start_pos_literal, length_literal);
540 return std::make_unique<const Substring>(var_string_optional_literal,
545 CHECK_GE(num_non_variable_literals, 2UL);
546 CHECK_LE(num_non_variable_literals, 3UL);
548 const auto start_pos_literal = string_op_info.
getIntLiteral(2);
550 if (has_length_literal) {
552 return std::make_unique<const Overlay>(var_string_optional_literal,
553 replace_string_literal,
557 return std::make_unique<const Overlay>(
558 var_string_optional_literal, replace_string_literal, start_pos_literal);
562 CHECK_GE(num_non_variable_literals, 2UL);
563 CHECK_LE(num_non_variable_literals, 2UL);
566 return std::make_unique<const Replace>(var_string_optional_literal,
567 pattern_string_literal,
568 replacement_string_literal);
571 CHECK_GE(num_non_variable_literals, 2UL);
572 CHECK_LE(num_non_variable_literals, 2UL);
574 const auto split_part_literal = string_op_info.
getIntLiteral(2);
575 return std::make_unique<const SplitPart>(
576 var_string_optional_literal, delimiter_literal, split_part_literal);
579 CHECK_GE(num_non_variable_literals, 5UL);
580 CHECK_LE(num_non_variable_literals, 5UL);
583 const auto start_pos_literal = string_op_info.
getIntLiteral(3);
584 const auto occurrence_literal = string_op_info.
getIntLiteral(4);
586 return std::make_unique<const RegexpReplace>(var_string_optional_literal,
591 regex_params_literal);
594 CHECK_GE(num_non_variable_literals, 5UL);
595 CHECK_LE(num_non_variable_literals, 5UL);
597 const auto start_pos_literal = string_op_info.
getIntLiteral(2);
598 const auto occurrence_literal = string_op_info.
getIntLiteral(3);
600 const auto sub_match_idx_literal = string_op_info.
getIntLiteral(5);
601 return std::make_unique<const RegexpSubstr>(var_string_optional_literal,
605 regex_params_literal,
606 sub_match_idx_literal);
610 return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
614 return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
621 const std::string null_str{
""};
622 return std::make_pair(null_str,
true);
625 return string_op->operator()().toPair();
size_t numLiterals() const
bool intLiteralArgAtIdxExists(const size_t index) const
int64_t getIntLiteral(const size_t index) const
size_t numNonVariableLiterals() const
bool hasNullLiteralArg() const
bool hasVarStringLiteral() const
std::pair< std::string, bool > apply_string_op_to_literals(const StringOpInfo &string_op_info)
OUTPUT transform(INPUT const &input, FUNC const &func)
std::string getStringLiteral(const size_t index) const
const SqlStringOpKind & getOpKind() const
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)