OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps_Namespace Namespace Reference

Classes

struct  StringOpInfo
 

Typedefs

using LiteralArgMap = std::map< size_t, std::pair< SQLTypes, Datum >>
 

Functions

std::ostream & operator<< (std::ostream &stream, const StringOpInfo &string_op_info)
 
std::ostream & operator<< (std::ostream &stream, const std::vector< StringOpInfo > &string_op_infos)
 
std::string toString (const std::vector< StringOpInfo > &string_op_infos)
 
double compute_jaro_score (std::string_view s1, std::string_view s2)
 
double compute_jaro_winkler_score (std::string_view s1, std::string_view s2)
 
template<typename T >
T compute_levenshtein_distance_template (std::string_view s1, std::string_view s2)
 
int64_t compute_levenshtein_distance (std::string_view s1, std::string_view s2)
 
std::unique_ptr< const StringOp > gen_string_op (const StringOpInfo &string_op_info)
 
std::pair< std::string, bool > apply_string_op_to_literals (const StringOpInfo &string_op_info)
 
Datum apply_numeric_op_to_literals (const StringOpInfo &string_op_info)
 

Variables

constexpr int winkler_k_prefix_length = 4
 
constexpr double winkler_k_scaling_factor = 0.1
 

Typedef Documentation

using StringOps_Namespace::LiteralArgMap = std::map<size_t, std::pair<SQLTypes, Datum>>

Definition at line 30 of file StringOpInfo.h.

Function Documentation

Datum StringOps_Namespace::apply_numeric_op_to_literals ( const StringOpInfo &  string_op_info)

Definition at line 1145 of file StringOps.cpp.

References CHECK, gen_string_op(), and StringOps_Namespace::StringOpInfo::hasVarStringLiteral().

Referenced by anonymous_namespace{ExpressionRewrite.cpp}::ConstantFoldingVisitor::visitStringOper().

1145  {
1146  CHECK(string_op_info.hasVarStringLiteral());
1147  const auto string_op = gen_string_op(string_op_info);
1148  return string_op->numericEval();
1149 }
#define CHECK(condition)
Definition: Logger.h:291
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:922

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::pair<std::string, bool > StringOps_Namespace::apply_string_op_to_literals ( const StringOpInfo &  string_op_info)

Definition at line 1134 of file StringOps.cpp.

References CHECK, gen_string_op(), StringOps_Namespace::StringOpInfo::hasNullLiteralArg(), and StringOps_Namespace::StringOpInfo::hasVarStringLiteral().

Referenced by TransientStringLiteralsVisitor::visitStringOper(), and anonymous_namespace{ExpressionRewrite.cpp}::ConstantFoldingVisitor::visitStringOper().

1135  {
1136  CHECK(string_op_info.hasVarStringLiteral());
1137  if (string_op_info.hasNullLiteralArg()) {
1138  const std::string null_str{""};
1139  return std::make_pair(null_str, true);
1140  }
1141  const auto string_op = gen_string_op(string_op_info);
1142  return string_op->operator()().toPair();
1143 }
#define CHECK(condition)
Definition: Logger.h:291
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:922

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

double StringOps_Namespace::compute_jaro_score ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 127 of file StringOps.cpp.

References score.

Referenced by compute_jaro_winkler_score().

127  {
128  int s1_len = s1.size();
129  int s2_len = s2.size();
130 
131  if (s1_len == 0 || s2_len == 0) {
132  return 0.0;
133  }
134 
135  int match_distance = std::max(s1_len, s2_len) / 2 - 1;
136  std::vector<bool> s1_match(s1_len, false);
137  std::vector<bool> s2_match(s2_len, false);
138 
139  int matches = 0;
140  int transpositions = 0;
141 
142  for (int i = 0; i < s1_len; ++i) {
143  int start = std::max(0, i - match_distance);
144  int end = std::min(i + match_distance + 1, s2_len);
145 
146  for (int j = start; j < end; ++j) {
147  if (s2_match[j]) {
148  continue;
149  }
150  if (s1[i] != s2[j]) {
151  continue;
152  }
153  s1_match[i] = true;
154  s2_match[j] = true;
155  ++matches;
156  break;
157  }
158  }
159 
160  if (matches == 0) {
161  return 0.0;
162  }
163 
164  int k = 0;
165  for (int i = 0; i < s1_len; ++i) {
166  if (!s1_match[i]) {
167  continue;
168  }
169  while (!s2_match[k]) {
170  ++k;
171  }
172  if (s1[i] != s2[k]) {
173  ++transpositions;
174  }
175  ++k;
176  }
177 
178  double score = ((matches / (double)s1_len) + (matches / (double)s2_len) +
179  ((matches - transpositions / 2.0) / matches)) /
180  3.0;
181 
182  return score;
183 }

+ Here is the caller graph for this function:

double StringOps_Namespace::compute_jaro_winkler_score ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 185 of file StringOps.cpp.

References Datum::bigintval, compute_jaro_score(), anonymous_namespace{Utm.h}::n, NullDatum(), UNREACHABLE, winkler_k_prefix_length, and winkler_k_scaling_factor.

185  {
186  double jaro_score = compute_jaro_score(s1, s2);
187 
188  int l = 0;
189  int n = std::min({static_cast<int>(s1.size()),
190  static_cast<int>(s2.size()),
191  winkler_k_prefix_length});
192 
193  for (; l < n; ++l) {
194  if (s1[l] != s2[l]) {
195  break;
196  }
197  }
198 
199  double winkler_adjustment = l * winkler_k_scaling_factor * (1 - jaro_score);
200  double jaro_winkler_score = jaro_score + winkler_adjustment;
201 
202  return jaro_winkler_score * 100;
203 }
constexpr int winkler_k_prefix_length
Definition: StringOps.cpp:122
double compute_jaro_score(std::string_view s1, std::string_view s2)
Definition: StringOps.cpp:127
constexpr double n
Definition: Utm.h:38
constexpr double winkler_k_scaling_factor
Definition: StringOps.cpp:125

+ Here is the call graph for this function:

int64_t StringOps_Namespace::compute_levenshtein_distance ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 278 of file StringOps.cpp.

278  {
279  const size_t max_len = std::max(s1.size(), s2.size());
280 
281  if (max_len < 256) {
282  return compute_levenshtein_distance_template<uint8_t>(s1, s2);
283  } else if (max_len < 65536) {
284  return compute_levenshtein_distance_template<uint16_t>(s1, s2);
285  } else if (max_len < std::numeric_limits<uint32_t>::max()) {
286  return compute_levenshtein_distance_template<uint32_t>(s1, s2);
287  } else {
288  return compute_levenshtein_distance_template<uint64_t>(s1, s2);
289  }
290 }
template<typename T >
T StringOps_Namespace::compute_levenshtein_distance_template ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 255 of file StringOps.cpp.

255  {
256  const size_t len1 = s1.size(), len2 = s2.size();
257  std::vector<std::vector<T>> d(len1 + 1, std::vector<T>(len2 + 1));
258 
259  d[0][0] = 0;
260  for (size_t i = 1; i <= len1; ++i) {
261  d[i][0] = i;
262  }
263  for (size_t i = 1; i <= len2; ++i) {
264  d[0][i] = i;
265  }
266 
267  for (size_t i = 1; i <= len1; ++i) {
268  for (size_t j = 1; j <= len2; ++j) {
269  d[i][j] = std::min({d[i - 1][j] + 1,
270  d[i][j - 1] + 1,
271  d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1)});
272  }
273  }
274 
275  return d[len1][len2];
276 }
std::unique_ptr<const StringOp> StringOps_Namespace::gen_string_op ( const StringOpInfo &  string_op_info)

Definition at line 922 of file StringOps.cpp.

References BASE64_DECODE, BASE64_ENCODE, CHECK_EQ, CHECK_GE, CHECK_LE, CONCAT, StringOps_Namespace::StringOpInfo::getIntLiteral(), StringOps_Namespace::StringOpInfo::getOpKind(), StringOps_Namespace::StringOpInfo::getReturnType(), StringOps_Namespace::StringOpInfo::getStringLiteral(), StringOps_Namespace::StringOpInfo::hasNullLiteralArg(), StringOps_Namespace::StringOpInfo::hasVarStringLiteral(), INITCAP, StringOps_Namespace::StringOpInfo::intLiteralArgAtIdxExists(), JAROWINKLER_SIMILARITY, JSON_VALUE, LEVENSHTEIN_DISTANCE, LOWER, LPAD, LTRIM, StringOps_Namespace::StringOpInfo::numLiterals(), StringOps_Namespace::StringOpInfo::numNonVariableLiterals(), OVERLAY, POSITION, RCONCAT, REGEXP_REPLACE, REGEXP_SUBSTR, REPEAT, REPLACE, REVERSE, RPAD, RTRIM, SPLIT_PART, SUBSTRING, TRIM, TRY_STRING_CAST, UNREACHABLE, and UPPER.

Referenced by apply_numeric_op_to_literals(), and apply_string_op_to_literals().

922  {
923  std::optional<std::string> var_string_optional_literal;
924  const auto op_kind = string_op_info.getOpKind();
925  const auto& return_ti = string_op_info.getReturnType();
926 
927  if (string_op_info.hasNullLiteralArg()) {
928  return std::make_unique<const NullOp>(var_string_optional_literal, op_kind);
929  }
930 
931  const auto num_non_variable_literals = string_op_info.numNonVariableLiterals();
932  if (string_op_info.hasVarStringLiteral()) {
933  CHECK_EQ(num_non_variable_literals + 1UL, string_op_info.numLiterals());
934  var_string_optional_literal = string_op_info.getStringLiteral(0);
935  }
936 
937  switch (op_kind) {
938  case SqlStringOpKind::LOWER: {
939  CHECK_EQ(num_non_variable_literals, 0UL);
940  return std::make_unique<const Lower>(var_string_optional_literal);
941  }
942  case SqlStringOpKind::UPPER: {
943  CHECK_EQ(num_non_variable_literals, 0UL);
944  return std::make_unique<const Upper>(var_string_optional_literal);
945  }
946  case SqlStringOpKind::INITCAP: {
947  CHECK_EQ(num_non_variable_literals, 0UL);
948  return std::make_unique<const InitCap>(var_string_optional_literal);
949  }
950  case SqlStringOpKind::REVERSE: {
951  CHECK_EQ(num_non_variable_literals, 0UL);
952  return std::make_unique<const Reverse>(var_string_optional_literal);
953  }
954  case SqlStringOpKind::REPEAT: {
955  CHECK_EQ(num_non_variable_literals, 1UL);
956  const auto num_repeats_literal = string_op_info.getIntLiteral(1);
957  return std::make_unique<const Repeat>(var_string_optional_literal,
958  num_repeats_literal);
959  }
960  case SqlStringOpKind::CONCAT:
961  case SqlStringOpKind::RCONCAT: {
962  CHECK_GE(num_non_variable_literals, 0UL);
963  CHECK_LE(num_non_variable_literals, 1UL);
964  if (num_non_variable_literals == 1UL) {
965  const auto str_literal = string_op_info.getStringLiteral(1);
966  // Handle lhs literals by having RCONCAT operator set a flag
967  return std::make_unique<const Concat>(var_string_optional_literal,
968  str_literal,
969  op_kind == SqlStringOpKind::RCONCAT);
970  } else {
971  return std::make_unique<const Concat>(var_string_optional_literal);
972  }
973  }
974  case SqlStringOpKind::LPAD:
975  case SqlStringOpKind::RPAD: {
976  CHECK_EQ(num_non_variable_literals, 2UL);
977  const auto padded_length_literal = string_op_info.getIntLiteral(1);
978  const auto padding_string_literal = string_op_info.getStringLiteral(2);
979  return std::make_unique<Pad>(var_string_optional_literal,
980  op_kind,
981  padded_length_literal,
982  padding_string_literal);
983  }
984  case SqlStringOpKind::TRIM:
985  case SqlStringOpKind::LTRIM:
986  case SqlStringOpKind::RTRIM: {
987  CHECK_EQ(num_non_variable_literals, 1UL);
988  const auto trim_chars_literal = string_op_info.getStringLiteral(1);
989  return std::make_unique<Trim>(
990  var_string_optional_literal, op_kind, trim_chars_literal);
991  }
992  case SqlStringOpKind::SUBSTRING: {
993  CHECK_GE(num_non_variable_literals, 1UL);
994  CHECK_LE(num_non_variable_literals, 2UL);
995  const auto start_pos_literal = string_op_info.getIntLiteral(1);
996  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(2);
997  if (has_length_literal) {
998  const auto length_literal = string_op_info.getIntLiteral(2);
999  return std::make_unique<const Substring>(
1000  var_string_optional_literal, start_pos_literal, length_literal);
1001  } else {
1002  return std::make_unique<const Substring>(var_string_optional_literal,
1003  start_pos_literal);
1004  }
1005  }
1006  case SqlStringOpKind::OVERLAY: {
1007  CHECK_GE(num_non_variable_literals, 2UL);
1008  CHECK_LE(num_non_variable_literals, 3UL);
1009  const auto replace_string_literal = string_op_info.getStringLiteral(1);
1010  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1011  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(3);
1012  if (has_length_literal) {
1013  const auto length_literal = string_op_info.getIntLiteral(3);
1014  return std::make_unique<const Overlay>(var_string_optional_literal,
1015  replace_string_literal,
1016  start_pos_literal,
1017  length_literal);
1018  } else {
1019  return std::make_unique<const Overlay>(
1020  var_string_optional_literal, replace_string_literal, start_pos_literal);
1021  }
1022  }
1023  case SqlStringOpKind::REPLACE: {
1024  CHECK_GE(num_non_variable_literals, 2UL);
1025  CHECK_LE(num_non_variable_literals, 2UL);
1026  const auto pattern_string_literal = string_op_info.getStringLiteral(1);
1027  const auto replacement_string_literal = string_op_info.getStringLiteral(2);
1028  return std::make_unique<const Replace>(var_string_optional_literal,
1029  pattern_string_literal,
1030  replacement_string_literal);
1031  }
1032  case SqlStringOpKind::SPLIT_PART: {
1033  CHECK_GE(num_non_variable_literals, 2UL);
1034  CHECK_LE(num_non_variable_literals, 2UL);
1035  const auto delimiter_literal = string_op_info.getStringLiteral(1);
1036  const auto split_part_literal = string_op_info.getIntLiteral(2);
1037  return std::make_unique<const SplitPart>(
1038  var_string_optional_literal, delimiter_literal, split_part_literal);
1039  }
1040  case SqlStringOpKind::REGEXP_REPLACE: {
1041  CHECK_GE(num_non_variable_literals, 5UL);
1042  CHECK_LE(num_non_variable_literals, 5UL);
1043  const auto pattern_literal = string_op_info.getStringLiteral(1);
1044  const auto replacement_literal = string_op_info.getStringLiteral(2);
1045  const auto start_pos_literal = string_op_info.getIntLiteral(3);
1046  const auto occurrence_literal = string_op_info.getIntLiteral(4);
1047  const auto regex_params_literal = string_op_info.getStringLiteral(5);
1048  return std::make_unique<const RegexpReplace>(var_string_optional_literal,
1049  pattern_literal,
1050  replacement_literal,
1051  start_pos_literal,
1052  occurrence_literal,
1053  regex_params_literal);
1054  }
1055  case SqlStringOpKind::REGEXP_SUBSTR: {
1056  CHECK_GE(num_non_variable_literals, 5UL);
1057  CHECK_LE(num_non_variable_literals, 5UL);
1058  const auto pattern_literal = string_op_info.getStringLiteral(1);
1059  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1060  const auto occurrence_literal = string_op_info.getIntLiteral(3);
1061  const auto regex_params_literal = string_op_info.getStringLiteral(4);
1062  const auto sub_match_idx_literal = string_op_info.getIntLiteral(5);
1063  return std::make_unique<const RegexpSubstr>(var_string_optional_literal,
1064  pattern_literal,
1065  start_pos_literal,
1066  occurrence_literal,
1067  regex_params_literal,
1068  sub_match_idx_literal);
1069  }
1070  case SqlStringOpKind::JSON_VALUE: {
1071  CHECK_EQ(num_non_variable_literals, 1UL);
1072  const auto json_path_literal = string_op_info.getStringLiteral(1);
1073  return std::make_unique<const JsonValue>(var_string_optional_literal,
1074  json_path_literal);
1075  }
1076  case SqlStringOpKind::BASE64_ENCODE: {
1077  CHECK_EQ(num_non_variable_literals, 0UL);
1078  return std::make_unique<const Base64Encode>(var_string_optional_literal);
1079  }
1080  case SqlStringOpKind::BASE64_DECODE: {
1081  CHECK_EQ(num_non_variable_literals, 0UL);
1082  return std::make_unique<const Base64Decode>(var_string_optional_literal);
1083  }
1084  case SqlStringOpKind::TRY_STRING_CAST: {
1085  CHECK_EQ(num_non_variable_literals, 0UL);
1086  return std::make_unique<const TryStringCast>(return_ti,
1087  var_string_optional_literal);
1088  }
1089  case SqlStringOpKind::POSITION: {
1090  CHECK_GE(num_non_variable_literals, 1UL);
1091  CHECK_LE(num_non_variable_literals, 2UL);
1092  const auto search_literal = string_op_info.getStringLiteral(1);
1093  const bool has_start_pos_literal = string_op_info.intLiteralArgAtIdxExists(2);
1094  if (has_start_pos_literal) {
1095  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1096  return std::make_unique<const Position>(
1097  var_string_optional_literal, search_literal, start_pos_literal);
1098  } else {
1099  return std::make_unique<const Position>(var_string_optional_literal,
1100  search_literal);
1101  }
1102  }
1103  case SqlStringOpKind::JAROWINKLER_SIMILARITY: {
1104  CHECK_GE(num_non_variable_literals, 0UL);
1105  CHECK_LE(num_non_variable_literals, 1UL);
1106  if (num_non_variable_literals == 1UL) {
1107  const auto str_literal = string_op_info.getStringLiteral(1);
1108  return std::make_unique<const JarowinklerSimilarity>(var_string_optional_literal,
1109  str_literal);
1110  } else {
1111  return std::make_unique<const JarowinklerSimilarity>(var_string_optional_literal);
1112  }
1113  }
1114  case SqlStringOpKind::LEVENSHTEIN_DISTANCE: {
1115  CHECK_GE(num_non_variable_literals, 0UL);
1116  CHECK_LE(num_non_variable_literals, 1UL);
1117  if (num_non_variable_literals == 1UL) {
1118  const auto str_literal = string_op_info.getStringLiteral(1);
1119  return std::make_unique<const LevenshteinDistance>(var_string_optional_literal,
1120  str_literal);
1121  } else {
1122  return std::make_unique<const LevenshteinDistance>(var_string_optional_literal);
1123  }
1124  }
1125  default: {
1126  UNREACHABLE();
1127  return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
1128  }
1129  }
1130  // Make compiler happy
1131  return std::make_unique<NullOp>(var_string_optional_literal, op_kind);
1132 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
#define CHECK_LE(x, y)
Definition: Logger.h:304

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::ostream& StringOps_Namespace::operator<< ( std::ostream &  stream,
const StringOpInfo &  string_op_info 
)

Definition at line 24 of file StringOpInfo.cpp.

References CHECK, extract_int_type_from_datum(), SQLTypeInfo::get_dimension(), SQLTypeInfo::get_scale(), SQLTypeInfo::get_type(), StringOps_Namespace::StringOpInfo::getOpKind(), StringOps_Namespace::StringOpInfo::getReturnType(), IS_INTEGER, IS_STRING, StringOps_Namespace::StringOpInfo::isLiteralArgNull(), StringOps_Namespace::StringOpInfo::literal_arg_map_, and toString().

24  {
25  stream << "StringOp("
26  << "operator: " << string_op_info.getOpKind()
27  << "return_ti: " << toString(string_op_info.getReturnType().get_type())
28  << " dim: " << string_op_info.getReturnType().get_dimension()
29  << " scale: " << string_op_info.getReturnType().get_scale() << ", literals: [";
30  bool first_elem = true;
31  for (const auto& literal_arg : string_op_info.literal_arg_map_) {
32  if (!first_elem) {
33  stream << ", ";
34  }
35  first_elem = false;
36  const auto datum_type = literal_arg.second.first;
37  const auto& datum = literal_arg.second.second;
38  stream << "{slot: " << literal_arg.first /* slot/idx */ << ", type: "
39  << ::toString(datum_type) << ", value: ";
40  if (string_op_info.isLiteralArgNull(datum_type, literal_arg.second.second)) {
41  stream << "NULL";
42  } else if (IS_STRING(datum_type)) {
43  stream << *datum.stringval;
44  } else {
45  CHECK(IS_INTEGER(datum_type));
46  const SQLTypeInfo ti(datum_type, false);
47  stream << extract_int_type_from_datum(datum, ti);
48  }
49  stream << "}";
50  }
51  stream << "]";
52  return stream;
53 }
std::string toString(const QueryDescriptionType &type)
Definition: Types.h:64
int64_t extract_int_type_from_datum(const Datum datum, const SQLTypeInfo &ti)
Definition: Datum.cpp:523
#define IS_INTEGER(T)
Definition: sqltypes.h:304
#define IS_STRING(T)
Definition: sqltypes.h:309
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

std::ostream & StringOps_Namespace::operator<< ( std::ostream &  stream,
const std::vector< StringOpInfo > &  string_op_infos 
)

Definition at line 55 of file StringOpInfo.cpp.

56  {
57  stream << "[";
58  bool first_elem = true;
59  for (const auto& string_op_info : string_op_infos) {
60  if (!first_elem) {
61  stream << ", ";
62  }
63  first_elem = false;
64  stream << string_op_info;
65  }
66  stream << "]";
67  return stream;
68 }
std::string StringOps_Namespace::toString ( const std::vector< StringOpInfo > &  string_op_infos)

Definition at line 70 of file StringOpInfo.cpp.

Referenced by operator<<().

70  {
71  std::ostringstream oss;
72  oss << string_op_infos;
73  return oss.str();
74 }

+ Here is the caller graph for this function:

Variable Documentation

constexpr int StringOps_Namespace::winkler_k_prefix_length = 4

Definition at line 122 of file StringOps.cpp.

Referenced by compute_jaro_winkler_score().

constexpr double StringOps_Namespace::winkler_k_scaling_factor = 0.1

Definition at line 125 of file StringOps.cpp.

Referenced by compute_jaro_winkler_score().