OmniSciDB  ca0c39ec8f
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringTransform.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringTransform.h"
18 #include "Logger/Logger.h"
19 
20 #include <boost/algorithm/string/classification.hpp> // Include for boost::is_any_of
21 #include <boost/algorithm/string/split.hpp> // Include for boost::split
22 
23 #include <numeric>
24 #include <random>
25 #include <regex>
26 
27 #ifndef __CUDACC__
28 #include <boost/filesystem.hpp>
29 #include <iomanip>
30 #endif
31 
32 void apply_shim(std::string& result,
33  const boost::regex& reg_expr,
34  const std::function<void(std::string&, const boost::smatch&)>& shim_fn) {
35  boost::smatch what;
36  std::vector<std::pair<size_t, size_t>> lit_pos = find_string_literals(result);
37  auto start_it = result.cbegin();
38  auto end_it = result.cend();
39  while (true) {
40  if (!boost::regex_search(start_it, end_it, what, reg_expr)) {
41  break;
42  }
43  const auto next_start =
44  inside_string_literal(what.position(), what.length(), lit_pos);
45  if (next_start) {
46  start_it = result.cbegin() + *next_start;
47  } else {
48  shim_fn(result, what);
49  lit_pos = find_string_literals(result);
50  start_it = result.cbegin();
51  end_it = result.cend();
52  }
53  }
54 }
55 
56 std::vector<std::pair<size_t, size_t>> find_string_literals(const std::string& query) {
57  boost::regex literal_string_regex{R"(([^']+)('(?:[^']+|'')+'))", boost::regex::perl};
58  boost::smatch what;
59  auto it = query.begin();
60  auto prev_it = it;
61  std::vector<std::pair<size_t, size_t>> positions;
62  while (true) {
63  try {
64  if (!boost::regex_search(it, query.end(), what, literal_string_regex)) {
65  break;
66  }
67  } catch (const std::exception& e) {
68  LOG(WARNING) << "Error processing literals: " << e.what()
69  << "\nContinuing query parse...";
70  // boost::regex throws an exception about the complexity of matching when
71  // the wrong type of quotes are used or they're mismatched. Let the query
72  // through unmodified, the parser will throw a much more informative error.
73  // This can also throw on very long queries
74  break;
75  }
76  CHECK_GT(what[1].length(), 0);
77  prev_it = it;
78  it += what.length();
79  positions.emplace_back(prev_it + what[1].length() - query.begin(),
80  it - query.begin());
81  }
82  return positions;
83 }
84 
// Returns a copy of `query_str` in which sensitive values are replaced by
// XXXXXXXX:
//  - the quoted value assigned to password, s3_access_key, s3_secret_key,
//    s3_session_token, username or credential_string (case-insensitive);
//  - the argument that follows a \set_license command.
// Text that matches neither rule is returned unchanged.
std::string hide_sensitive_data_from_query(std::string const& query_str) {
  constexpr std::regex::flag_type flags =
      std::regex::ECMAScript | std::regex::icase | std::regex::optimize;
  // The patterns are compiled once, on first use.
  static const std::initializer_list<std::pair<std::regex, std::string>> rules{
      {std::regex(
           R"(\b((?:password|s3_access_key|s3_secret_key|s3_session_token|username|credential_string)\s*=\s*)'.+?')",
           flags),
       "$1'XXXXXXXX'"},
      {std::regex(R"((\\set_license\s+)\S+)", flags), "$1XXXXXXXX"}};
  // Apply the rules one after another. A plain loop is used instead of
  // std::accumulate with an `auto&` accumulator parameter: since C++20
  // accumulate passes the accumulator as an rvalue, which cannot bind to a
  // non-const lvalue reference.
  std::string masked{query_str};
  for (const auto& rule : rules) {
    masked = std::regex_replace(masked, rule.first, rule.second);
  }
  return masked;
}
99 
100 template <>
101 std::string to_string(char const*&& v) {
102  return std::string(v);
103 }
104 
105 template <>
106 std::string to_string(std::string&& v) {
107  return std::move(v);
108 }
109 
// Returns a view of the first `substr_length` characters of `str` together
// with a postfix to print after it: "..." when `str` had to be shortened and
// "" when the whole string fits.
//
// The returned view refers to the storage of `str`, so it must not be used
// after `str` is modified or destroyed.
std::pair<std::string_view, const char*> substring(const std::string& str,
                                                   size_t substr_length) {
  using ViewAndPostfix = std::pair<std::string_view, const char*>;
  const std::string_view whole{str};
  const bool needs_truncation = substr_length < whole.size();
  if (needs_truncation) {
    return ViewAndPostfix{whole.substr(0, substr_length), "..."};
  }
  return ViewAndPostfix{whole, ""};
}
122 
// Returns a string of `len` characters drawn uniformly from [0-9a-zA-Z].
//
// The generator is seeded once from std::random_device on first use and is
// then shared by every call.
// NOTE(review): the shared generator is not guarded by a lock in this
// function - confirm that concurrent callers are not expected.
std::string generate_random_string(const size_t len) {
  static constexpr char charset[] =
      "0123456789"
      "abcdefghijklmnopqrstuvwxyz"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  // sizeof includes the terminating NUL, which must never be picked.
  static constexpr size_t charset_size = sizeof(charset) - 1;

  static std::mt19937 prng{std::random_device{}()};
  static std::uniform_int_distribution<size_t> dist(0, charset_size - 1);

  std::string str;
  str.reserve(len);
  for (size_t i = 0; i < len; ++i) {
    str += charset[dist(prng)];
  }
  return str;
}
139 
140 #ifndef __CUDACC__
// This version of split works almost exactly like Python's split,
// which is very conveniently designed.
// See also: https://docs.python.org/3.8/library/stdtypes.html#str.split
//
// str:      text to split.
// delim:    separator; when empty, any run of whitespace separates words and
//           leading/trailing whitespace produces no empty entries.
// maxsplit: optional upper bound on the number of splits; the unsplit
//           remainder becomes the last element.
//
// Differences from Python in whitespace mode, as implemented here: the
// remainder appended once maxsplit is reached starts at the whitespace
// character that ended the previous word, and a maxsplit of 0 does not stop
// splitting because the limit is only checked after a word has been added.
std::vector<std::string> split(std::string_view str,
                               std::string_view delim,
                               std::optional<size_t> maxsplit) {
  std::vector<std::string> result;

  // Use an explicit delimiter.
  if (!delim.empty()) {
    std::string_view::size_type i = 0, j = 0;
    while ((i = str.find(delim, i)) != std::string_view::npos &&
           (!maxsplit || result.size() < maxsplit.value())) {
      result.emplace_back(str.substr(j, i - j));
      i += delim.size();
      j = i;
    }
    result.emplace_back(str.substr(j));
    return result;
  }

  // Treat any number of consecutive whitespace characters as a delimiter.
  // isspace() is only defined for values representable as unsigned char, so
  // convert first; a plain char may be negative for non-ASCII bytes.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  bool prev_ws = true;
  std::string_view::size_type j = 0;
  for (std::string_view::size_type i = 0; i < str.size(); ++i) {
    const bool ws = is_space(str[i]);
    if (prev_ws) {
      if (!ws) {
        // start of word
        prev_ws = false;
        j = i;
      }
    } else if (ws) {
      // start of space
      result.emplace_back(str.substr(j, i - j));
      prev_ws = true;
      j = i;
      if (maxsplit && result.size() == maxsplit.value()) {
        // stop early if maxsplit was reached
        result.emplace_back(str.substr(j));
        return result;
      }
    }
  }
  if (!prev_ws) {
    result.emplace_back(str.substr(j));
  }
  return result;
}
192 
// Returns `str` without its leading and trailing whitespace, as classified by
// std::isspace. The result is a view into the same characters as `str`; no
// copy is made.
std::string_view sv_strip(std::string_view str) {
  // isspace() is only defined for values representable as unsigned char, so
  // convert first; a plain char may be negative for non-ASCII bytes.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  std::string_view::size_type first = 0;
  while (first < str.size() && is_space(str[first])) {
    ++first;
  }
  std::string_view::size_type last = str.size();
  while (last > first && is_space(str[last - 1])) {
    --last;
  }
  return str.substr(first, last - first);
}
201 
202 std::string strip(std::string_view str) {
203  return std::string(sv_strip(str));
204 }
205 
// Checks whether the range [start, start + length) lies completely within one
// of the (begin, end) ranges in `literal_positions`.
//
// Returns the `end` value of the first enclosing range, or std::nullopt when
// no range encloses it.
std::optional<size_t> inside_string_literal(
    const size_t start,
    const size_t length,
    const std::vector<std::pair<size_t, size_t>>& literal_positions) {
  const size_t match_end = start + length;
  for (auto span = literal_positions.begin(); span != literal_positions.end(); ++span) {
    const bool begins_inside = span->first <= start;
    const bool ends_inside = match_end <= span->second;
    if (begins_inside && ends_inside) {
      return span->second;
    }
  }
  return std::nullopt;
}
218 
219 #endif // __CUDACC__
220 
// Replaces every newline, carriage return and tab that is outside of a quoted
// region of `str` with a space, in place.
//
// A quoted region starts at an unescaped ' or " and ends at the next
// unescaped quote of the same kind; a quote is escaped when it is directly
// preceded by a backslash that is not itself escaped. Characters inside a
// quoted region are left as they are.
//
// Returns true if every quoted region was closed, false if the string ended
// inside one.
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(
    std::string& str) noexcept {
  char inside_quote = 0;  // the quote character of the open region, 0 if none
  bool previous_c_was_backslash = false;
  for (auto& c : str) {
    const bool is_quote = (c == '\'' || c == '\"');
    if (is_quote) {
      // an escaped quote neither opens nor closes a region
      if (!previous_c_was_backslash) {
        if (inside_quote == c) {
          // end region
          inside_quote = 0;
        } else if (inside_quote == 0) {
          // start region
          inside_quote = c;
        }
        // a quote of the other kind inside an open region is left alone
      }
    } else if (inside_quote == 0 && (c == '\n' || c == '\t' || c == '\r')) {
      // outside quoted region: replace these with space
      c = ' ';
    }
    // handle backslashes, except for double backslashes: the second backslash
    // of a pair is itself escaped and does not escape what follows
    previous_c_was_backslash = (c == '\\') && !previous_c_was_backslash;
  }
  // if we didn't end a region, there were unclosed or mixed-nested quotes
  // accounting for backslashes should mean that this should only be the
  // case with truly malformed strings which Calcite will barf on anyway
  return (inside_quote == 0);
}
259 
260 #ifndef __CUDACC__
// Returns `filename` wrapped in `quote` characters; every occurrence of
// `quote` or `escape` inside it is prefixed with `escape` (std::quoted
// output format).
std::string get_quoted_string(const std::string& filename, char quote, char escape) {
  // TODO: std::quoted prevents taking a string_view here (Jun 2020)
  std::stringstream quoted_stream;
  const auto quoted_manip = std::quoted(filename, quote, escape);
  quoted_stream << quoted_manip;
  return quoted_stream.str();
}
266 #endif // __CUDACC__
267 
268 #ifndef __CUDACC__
// Returns a copy of `str` in which every control character (byte value below
// 32) is replaced by a space. All other bytes, including bytes >= 0x80 that
// are part of multi-byte encoded text, are kept.
std::string simple_sanitize(const std::string& str) {
  auto sanitized_str{str};
  for (auto& c : sanitized_str) {
    // Compare as unsigned: where char is signed, bytes >= 0x80 are negative
    // and would otherwise be mistaken for control characters.
    if (static_cast<unsigned char>(c) < 32) {
      c = ' ';
    }
  }
  return sanitized_str;
}
276 #endif // __CUDACC__
std::string hide_sensitive_data_from_query(std::string const &query_str)
std::vector< std::pair< size_t, size_t > > find_string_literals(const std::string &query)
std::optional< size_t > inside_string_literal(const size_t start, const size_t length, const std::vector< std::pair< size_t, size_t >> &literal_positions)
std::string_view sv_strip(std::string_view str)
return trimmed string_view
std::string strip(std::string_view str)
trim any whitespace from the left and right ends of a string
#define LOG(tag)
Definition: Logger.h:216
std::string simple_sanitize(const std::string &str)
simple sanitize string (replace control characters with space)
#define CHECK_GT(x, y)
Definition: Logger.h:234
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
std::string get_quoted_string(const std::string &filename, char quote, char escape)
Quote a string while escaping any existing quotes in the string.
std::string generate_random_string(const size_t len)
DEVICE auto accumulate(ARGS &&...args)
Definition: gpu_enabled.h:42
void apply_shim(std::string &result, const boost::regex &reg_expr, const std::function< void(std::string &, const boost::smatch &)> &shim_fn)
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(std::string &str) noexcept
sanitize an SQL string
std::pair< std::string_view, const char * > substring(const std::string &str, size_t substr_length)
return substring of str with postfix if str.size() > substr_length