OmniSciDB  0fdbebe030
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringTransform.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringTransform.h"
18 
19 #include <numeric>
20 #include <random>
21 #include <regex>
22 
23 void apply_shim(std::string& result,
24  const boost::regex& reg_expr,
25  const std::function<void(std::string&, const boost::smatch&)>& shim_fn) {
26  boost::smatch what;
27  std::vector<std::pair<size_t, size_t>> lit_pos = find_string_literals(result);
28  auto start_it = result.cbegin();
29  auto end_it = result.cend();
30  while (true) {
31  if (!boost::regex_search(start_it, end_it, what, reg_expr)) {
32  break;
33  }
34  const auto next_start =
35  inside_string_literal(what.position(), what.length(), lit_pos);
36  if (next_start >= 0) {
37  start_it = result.cbegin() + next_start;
38  } else {
39  shim_fn(result, what);
40  lit_pos = find_string_literals(result);
41  start_it = result.cbegin();
42  end_it = result.cend();
43  }
44  }
45 }
46 
47 std::vector<std::pair<size_t, size_t>> find_string_literals(const std::string& query) {
48  boost::regex literal_string_regex{R"(([^']+)('(?:[^']+|'')+'))", boost::regex::perl};
49  boost::smatch what;
50  auto it = query.begin();
51  auto prev_it = it;
52  std::vector<std::pair<size_t, size_t>> positions;
53  while (true) {
54  if (!boost::regex_search(it, query.end(), what, literal_string_regex)) {
55  break;
56  }
57  CHECK_GT(what[1].length(), 0);
58  prev_it = it;
59  it += what.length();
60  positions.emplace_back(prev_it + what[1].length() - query.begin(),
61  it - query.begin());
62  }
63  return positions;
64 }
65 
// Returns a copy of query_str in which secrets are replaced by XXXXXXXX.
//
// Two rewrite rules are applied in order, both case-insensitively:
//  1. password / s3_access_key / s3_secret_key = '<value>'  ->  ... = 'XXXXXXXX'
//  2. \set_license <key>                                     ->  \set_license XXXXXXXX
//
// The quoted value in rule 1 is matched as a complete SQL string literal: it
// may be empty, may span lines, and may contain doubled quotes (''), so no
// part of such a value is left unmasked.
std::string hide_sensitive_data_from_query(std::string const& query_str) {
  constexpr std::regex::flag_type flags =
      std::regex::ECMAScript | std::regex::icase | std::regex::optimize;
  // Compiled once; each entry is (pattern, replacement format).
  static const std::initializer_list<std::pair<std::regex, std::string>> rules{
      {std::regex(
           R"(\b((?:password|s3_access_key|s3_secret_key)\s*=\s*)'(?:[^']|'')*')",
           flags),
       "$1'XXXXXXXX'"},
      {std::regex(R"((\\set_license\s+)\S+)", flags), "$1XXXXXXXX"}};
  std::string masked = query_str;
  for (const auto& rule : rules) {
    masked = std::regex_replace(masked, rule.first, rule.second);
  }
  return masked;
}
78 
// Checks whether the range [start, start + length) lies entirely inside one of
// the given literal ranges.
//
// start             - offset of the first character of the range.
// length            - number of characters in the range.
// literal_positions - (begin, end) offset pairs, compared as begin <= start and
//                     start + length <= end.
//
// Returns the end offset of the first enclosing literal, or -1 when no literal
// encloses the range.
ssize_t inside_string_literal(
    const size_t start,
    const size_t length,
    const std::vector<std::pair<size_t, size_t>>& literal_positions) {
  const auto end = start + length;
  for (const auto& literal_position : literal_positions) {
    const bool starts_inside = literal_position.first <= start;
    const bool ends_inside = end <= literal_position.second;
    if (starts_inside && ends_inside) {
      return literal_position.second;
    }
  }
  return -1;
}
91 
92 template <>
93 std::string to_string(char const*&& v) {
94  return std::string(v);
95 }
96 
97 template <>
98 std::string to_string(std::string&& v) {
99  return std::move(v);
100 }
101 
// Returns a string of len characters drawn uniformly from [0-9a-zA-Z].
//
// The generator is a function-local static std::mt19937 seeded once from
// std::random_device; no locking is performed around it here.
std::string generate_random_string(const size_t len) {
  static constexpr char charset[] =
      "0123456789"
      "abcdefghijklmnopqrstuvwxyz"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  // sizeof counts the terminating NUL, which must not be selectable.
  static constexpr size_t charset_size = sizeof(charset) - 1;

  static std::mt19937 prng{std::random_device{}()};
  static std::uniform_int_distribution<size_t> dist(0, charset_size - 1);

  std::string str;
  str.reserve(len);
  for (size_t i = 0; i < len; ++i) {
    str += charset[dist(prng)];
  }
  return str;
}
118 
119 #ifndef __CUDACC__
// This version of split works almost exactly like Python's split,
// which is very conveniently designed.
// See also: https://docs.python.org/3.8/library/stdtypes.html#str.split
//
// str      - the text to split.
// delim    - explicit delimiter; when empty, any run of whitespace separates
//            words and leading/trailing whitespace produces no empty entries.
// maxsplit - optional upper bound on the number of splits performed.
//
// Differences from Python in whitespace mode: once maxsplit is reached, the
// final element is the rest of str starting at the whitespace that ended the
// previous word (its leading whitespace is kept), and the limit is only checked
// after a word has been emitted, so a maxsplit of 0 does not stop the scan.
std::vector<std::string> split(std::string_view str,
                               std::string_view delim,
                               std::optional<size_t> maxsplit) {
  std::vector<std::string> result;

  // Use an explicit delimiter.
  if (!delim.empty()) {
    std::string::size_type i = 0, j = 0;
    while ((i = str.find(delim, i)) != std::string::npos &&
           (!maxsplit || result.size() < maxsplit.value())) {
      result.emplace_back(str, j, i - j);
      i += delim.size();
      j = i;
    }
    // Whatever follows the last consumed delimiter (possibly empty).
    result.emplace_back(str, j, std::string::npos);
    return result;
  }

  // Treat any number of consecutive whitespace characters as a delimiter.
  // std::isspace requires a value representable as unsigned char; passing a
  // negative plain char (any non-ASCII byte) is undefined behavior.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  bool prev_ws = true;
  std::string::size_type j = 0;
  for (std::string::size_type i = 0; i < str.size(); ++i) {
    const bool ws = is_space(str[i]);
    if (prev_ws) {
      if (!ws) {
        // start of word
        prev_ws = false;
        j = i;
      }
    } else if (ws) {
      // start of space: the word [j, i) is complete
      result.emplace_back(str, j, i - j);
      prev_ws = true;
      j = i;
      if (maxsplit && result.size() == maxsplit.value()) {
        // stop early if maxsplit was reached
        result.emplace_back(str, j, std::string::npos);
        return result;
      }
    }
  }
  if (!prev_ws) {
    // str ended in the middle of a word
    result.emplace_back(str, j, std::string::npos);
  }
  return result;
}
171 
// Returns a copy of str with whitespace (as classified by std::isspace)
// removed from both ends. Interior whitespace is left untouched.
std::string strip(std::string_view str) {
  // std::isspace requires a value representable as unsigned char; passing a
  // negative plain char (any non-ASCII byte) is undefined behavior.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  // i: index of the first character to keep.
  std::string::size_type i = 0;
  while (i < str.size() && is_space(str[i])) {
    ++i;
  }
  // j: one past the last character to keep; never moves below i.
  std::string::size_type j = str.size();
  while (j > i && is_space(str[j - 1])) {
    --j;
  }
  return std::string(str.substr(i, j - i));
}
180 #endif // __CUDACC__
181 
// Sanitizes an SQL string in place: every newline, carriage return and tab
// that is outside a quoted region is replaced by a single space.
//
// A quoted region starts at an unescaped ' or " and ends at the next unescaped
// quote of the same kind; a quote of the other kind inside a region is ordinary
// text. A quote is escaped when it is preceded by an odd number of consecutive
// backslashes.
//
// Returns true when every quoted region was closed, false when the string ends
// inside one (the replacements made up to that point are kept).
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(
    std::string& str) noexcept {
  char open_quote = 0;   // 0 while outside a region, else the opening quote char
  bool escaped = false;  // true when the previous char was an unpaired backslash
  for (char& c : str) {
    const bool is_quote = (c == '\'' || c == '\"');
    if (is_quote) {
      // an escaped quote neither opens nor closes a region
      if (!escaped) {
        if (open_quote == c) {
          // end region
          open_quote = 0;
        } else if (open_quote == 0) {
          // start region
          open_quote = c;
        }
        // otherwise: a quote of the other type inside a region, leave alone
      }
    } else if (open_quote == 0 && (c == '\n' || c == '\t' || c == '\r')) {
      // outside quoted region: replace these with space
      c = ' ';
    }
    // a backslash toggles the flag so that a double backslash cancels out;
    // any other character clears it
    escaped = (c == '\\') && !escaped;
  }
  // if we didn't end a region, there were unclosed or mixed-nested quotes
  return open_quote == 0;
}
220 
// Removes one pair of matching enclosing quotes from str, in place.
//
// The string is modified only when it has at least two characters, begins with
// ' or ", and ends with that same character. Returns true when the quotes were
// removed, false when str was left unchanged.
bool unquote(std::string& str) {
  if (str.size() < 2) {
    return false;
  }
  const char opening = str.front();
  const bool is_quote_char = (opening == '\'' || opening == '"');
  if (!is_quote_char || str.back() != opening) {
    return false;
  }
  str.pop_back();
  str.erase(str.begin());
  return true;
}
std::string hide_sensitive_data_from_query(std::string const &query_str)
ssize_t inside_string_literal(const size_t start, const size_t length, const std::vector< std::pair< size_t, size_t >> &literal_positions)
std::vector< std::pair< size_t, size_t > > find_string_literals(const std::string &query)
std::string strip(std::string_view str)
trim any whitespace from the left and right ends of a string
#define CHECK_GT(x, y)
Definition: Logger.h:209
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
void start()
Definition: Asio.cpp:33
std::string generate_random_string(const size_t len)
void apply_shim(std::string &result, const boost::regex &reg_expr, const std::function< void(std::string &, const boost::smatch &)> &shim_fn)
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(std::string &str) noexcept
sanitize an SQL string
bool unquote(std::string &str)