OmniSciDB  467d548b97
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringTransform.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringTransform.h"
18 #include "Logger/Logger.h"
19 
20 #include <numeric>
21 #include <random>
22 #include <regex>
23 
24 #ifndef __CUDACC__
25 #include <boost/filesystem.hpp>
26 #include <iomanip>
27 #endif
28 
29 void apply_shim(std::string& result,
30  const boost::regex& reg_expr,
31  const std::function<void(std::string&, const boost::smatch&)>& shim_fn) {
32  boost::smatch what;
33  std::vector<std::pair<size_t, size_t>> lit_pos = find_string_literals(result);
34  auto start_it = result.cbegin();
35  auto end_it = result.cend();
36  while (true) {
37  if (!boost::regex_search(start_it, end_it, what, reg_expr)) {
38  break;
39  }
40  const auto next_start =
41  inside_string_literal(what.position(), what.length(), lit_pos);
42  if (next_start) {
43  start_it = result.cbegin() + *next_start;
44  } else {
45  shim_fn(result, what);
46  lit_pos = find_string_literals(result);
47  start_it = result.cbegin();
48  end_it = result.cend();
49  }
50  }
51 }
52 
53 std::vector<std::pair<size_t, size_t>> find_string_literals(const std::string& query) {
54  boost::regex literal_string_regex{R"(([^']+)('(?:[^']+|'')+'))", boost::regex::perl};
55  boost::smatch what;
56  auto it = query.begin();
57  auto prev_it = it;
58  std::vector<std::pair<size_t, size_t>> positions;
59  while (true) {
60  if (!boost::regex_search(it, query.end(), what, literal_string_regex)) {
61  break;
62  }
63  CHECK_GT(what[1].length(), 0);
64  prev_it = it;
65  it += what.length();
66  positions.emplace_back(prev_it + what[1].length() - query.begin(),
67  it - query.begin());
68  }
69  return positions;
70 }
71 
// Returns a copy of `query_str` in which sensitive values are replaced by XXXXXXXX.
//
// Two rewrite rules are applied in order, both case-insensitively:
//   1. the quoted value following `password =`, `s3_access_key =` or
//      `s3_secret_key =` (the key and the quotes are kept);
//   2. the argument following the `\set_license` command.
// The input string itself is not modified.
std::string hide_sensitive_data_from_query(std::string const& query_str) {
  constexpr std::regex::flag_type flags =
      std::regex::ECMAScript | std::regex::icase | std::regex::optimize;
  // Built on first use and reused afterwards; compiling the expressions is costly.
  static const std::initializer_list<std::pair<std::regex, std::string>> rules{
      {std::regex(R"(\b((?:password|s3_access_key|s3_secret_key)\s*=\s*)'.+?')", flags),
       "$1'XXXXXXXX'"},
      {std::regex(R"((\\set_license\s+)\S+)", flags), "$1XXXXXXXX"}};
  // The accumulator is taken by const reference: since C++20 std::accumulate passes
  // it as an rvalue, which a non-const lvalue reference parameter cannot bind to.
  return std::accumulate(
      rules.begin(),
      rules.end(),
      query_str,
      [](std::string const& str, std::pair<std::regex, std::string> const& rule) {
        return std::regex_replace(str, rule.first, rule.second);
      });
}
84 
85 template <>
86 std::string to_string(char const*&& v) {
87  return std::string(v);
88 }
89 
90 template <>
91 std::string to_string(std::string&& v) {
92  return std::move(v);
93 }
94 
// Returns a string of `len` characters drawn uniformly from [0-9a-zA-Z].
//
// The generator is a std::mt19937 seeded once from std::random_device; it and the
// distribution are function-local statics shared by every call, and this function
// adds no synchronization around them.
std::string generate_random_string(const size_t len) {
  static constexpr char charset[] =
      "0123456789"
      "abcdefghijklmnopqrstuvwxyz"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  // sizeof counts the terminating NUL, which must not be selectable.
  static constexpr size_t charset_size = sizeof(charset) - 1;

  static std::mt19937 prng{std::random_device{}()};
  static std::uniform_int_distribution<size_t> dist(0, charset_size - 1);

  std::string str;
  str.reserve(len);
  for (size_t i = 0; i < len; ++i) {
    str += charset[dist(prng)];
  }
  return str;
}
111 
112 #ifndef __CUDACC__
// Splits `str` into substrings. This works almost exactly like Python's str.split(),
// which is very conveniently designed.
// See also: https://docs.python.org/3.8/library/stdtypes.html#str.split
//
// str:      the text to split.
// delim:    if non-empty, the exact separator; adjacent separators yield empty
//           strings. If empty, any run of whitespace acts as one separator and
//           leading/trailing whitespace produces no empty strings.
// maxsplit: if set, the maximum number of splits to perform; the unsplit remainder
//           becomes the last element.
//
// Differences from Python in whitespace mode, kept for compatibility:
//   - once `maxsplit` is reached the remainder starts at the whitespace character
//     that ended the previous word, so it keeps its leading whitespace;
//   - the limit is only checked right after a word has been appended, so a
//     `maxsplit` of 0 does not stop the splitting.
std::vector<std::string> split(std::string_view str,
                               std::string_view delim,
                               std::optional<size_t> maxsplit) {
  std::vector<std::string> result;

  // Use an explicit delimiter.
  if (!delim.empty()) {
    std::string_view::size_type i = 0, j = 0;
    while ((i = str.find(delim, i)) != std::string_view::npos &&
           (!maxsplit || result.size() < *maxsplit)) {
      result.emplace_back(str.substr(j, i - j));
      i += delim.size();
      j = i;
    }
    // Whatever follows the last consumed delimiter (possibly nothing).
    result.emplace_back(str.substr(j));
    return result;
  }

  // Treat any number of consecutive whitespace characters as a delimiter.
  // The <cctype> classifiers are only defined for values representable as
  // unsigned char, so plain char (which may be negative) is converted first.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  bool prev_ws = true;
  std::string_view::size_type j = 0;
  for (std::string_view::size_type i = 0; i < str.size(); ++i) {
    if (prev_ws) {
      if (!is_space(str[i])) {
        // start of word
        prev_ws = false;
        j = i;
      }
    } else if (is_space(str[i])) {
      // start of space: the word [j, i) is complete
      result.emplace_back(str.substr(j, i - j));
      prev_ws = true;
      j = i;
      if (maxsplit && result.size() == *maxsplit) {
        // stop early if maxsplit was reached
        result.emplace_back(str.substr(j));
        return result;
      }
    }
  }
  if (!prev_ws) {
    // the input ended in the middle of a word
    result.emplace_back(str.substr(j));
  }
  return result;
}
164 
// Returns a copy of `str` with leading and trailing whitespace (as classified by
// std::isspace) removed. A string consisting only of whitespace yields "".
std::string strip(std::string_view str) {
  // The <cctype> classifiers are only defined for values representable as
  // unsigned char, so plain char (which may be negative) is converted first.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  // first: index of the first character to keep.
  std::string_view::size_type first = 0;
  while (first < str.size() && is_space(str[first])) {
    ++first;
  }
  // last: one past the final character to keep; never moves below `first`.
  std::string_view::size_type last = str.size();
  while (last > first && is_space(str[last - 1])) {
    --last;
  }
  return std::string(str.substr(first, last - first));
}
173 
// Checks whether the range [start, start + length) lies completely within one of the
// given literal ranges.
//
// literal_positions holds (begin, end) pairs; a range counts as inside when
// begin <= start and start + length <= end.
//
// Returns the `end` of the first enclosing pair, or std::nullopt if there is none.
std::optional<size_t> inside_string_literal(
    const size_t start,
    const size_t length,
    const std::vector<std::pair<size_t, size_t>>& literal_positions) {
  const size_t range_end = start + length;
  for (const auto& [literal_begin, literal_end] : literal_positions) {
    const bool starts_in_literal = literal_begin <= start;
    const bool ends_in_literal = range_end <= literal_end;
    if (starts_in_literal && ends_in_literal) {
      return literal_end;
    }
  }
  return std::nullopt;
}
186 
187 #endif // __CUDACC__
188 
// Sanitizes an SQL string in place: every newline, carriage return and tab that is
// not inside a quoted region is replaced by a single space.
//
// A quoted region starts at an unescaped ' or " and ends at the next unescaped quote
// of the same kind; the other quote kind is ignored while inside a region. A quote
// is escaped when it directly follows a backslash that is not itself escaped.
//
// Returns true if every quoted region was closed, false if the string ended inside
// one (in which case the characters processed so far have still been rewritten).
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(
    std::string& str) noexcept {
  char inside_quote = 0;  // 0 when outside, otherwise the quote char that opened it
  bool previous_c_was_backslash = false;
  for (auto& c : str) {
    // if this character is a quote of either type
    if (c == '\'' || c == '\"') {
      // ignore if previous character was a backslash
      if (!previous_c_was_backslash) {
        // start or end of a quoted region
        if (inside_quote == c) {
          // end region
          inside_quote = 0;
        } else if (inside_quote == 0) {
          // start region
          inside_quote = c;
        }
      }
    } else if (inside_quote == 0) {
      // outside quoted region
      if (c == '\n' || c == '\t' || c == '\r') {
        // replace these with space
        c = ' ';
      }
      // otherwise leave alone, including quotes of a different type
    }
    // handle backslashes, except for double backslashes: the second backslash of a
    // pair resets the flag so that it does not escape the following character
    if (c == '\\') {
      previous_c_was_backslash = !previous_c_was_backslash;
    } else {
      previous_c_was_backslash = false;
    }
  }
  // if we didn't end a region, there were unclosed or mixed-nested quotes
  // accounting for backslashes should mean that this should only be the
  // case with truly malformed strings which Calcite will barf on anyway
  return (inside_quote == 0);
}
227 
228 #ifndef __CUDACC__
// Wraps `filename` in `quote` characters; every occurrence of `quote` or `escape`
// inside it is prefixed with `escape`. The formatting is done by std::quoted.
std::string get_quoted_string(const std::string& filename, char quote, char escape) {
  std::ostringstream quoted_stream;
  // TODO: std::quoted is what prevents taking a string_view here (Jun 2020)
  quoted_stream << std::quoted(filename, quote, escape);
  return quoted_stream.str();
}
234 #endif // __CUDACC__
std::string hide_sensitive_data_from_query(std::string const &query_str)
std::string filename(char const *path)
Definition: Logger.cpp:62
std::vector< std::pair< size_t, size_t > > find_string_literals(const std::string &query)
std::optional< size_t > inside_string_literal(const size_t start, const size_t length, const std::vector< std::pair< size_t, size_t >> &literal_positions)
std::string strip(std::string_view str)
trim any whitespace from the left and right ends of a string
#define CHECK_GT(x, y)
Definition: Logger.h:209
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
std::string get_quoted_string(const std::string &filename, char quote, char escape)
Quote a string while escaping any existing quotes in the string.
std::string generate_random_string(const size_t len)
void apply_shim(std::string &result, const boost::regex &reg_expr, const std::function< void(std::string &, const boost::smatch &)> &shim_fn)
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(std::string &str) noexcept
sanitize an SQL string