OmniSciDB  06b3bd477c
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringTransform.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringTransform.h"
18 
19 #include <numeric>
20 #include <random>
21 #include <regex>
22 
23 #ifndef __CUDACC__
24 #include <boost/filesystem.hpp>
25 #include <iomanip>
26 #endif
27 
28 void apply_shim(std::string& result,
29  const boost::regex& reg_expr,
30  const std::function<void(std::string&, const boost::smatch&)>& shim_fn) {
31  boost::smatch what;
32  std::vector<std::pair<size_t, size_t>> lit_pos = find_string_literals(result);
33  auto start_it = result.cbegin();
34  auto end_it = result.cend();
35  while (true) {
36  if (!boost::regex_search(start_it, end_it, what, reg_expr)) {
37  break;
38  }
39  const auto next_start =
40  inside_string_literal(what.position(), what.length(), lit_pos);
41  if (next_start >= 0) {
42  start_it = result.cbegin() + next_start;
43  } else {
44  shim_fn(result, what);
45  lit_pos = find_string_literals(result);
46  start_it = result.cbegin();
47  end_it = result.cend();
48  }
49  }
50 }
51 
52 std::vector<std::pair<size_t, size_t>> find_string_literals(const std::string& query) {
53  boost::regex literal_string_regex{R"(([^']+)('(?:[^']+|'')+'))", boost::regex::perl};
54  boost::smatch what;
55  auto it = query.begin();
56  auto prev_it = it;
57  std::vector<std::pair<size_t, size_t>> positions;
58  while (true) {
59  if (!boost::regex_search(it, query.end(), what, literal_string_regex)) {
60  break;
61  }
62  CHECK_GT(what[1].length(), 0);
63  prev_it = it;
64  it += what.length();
65  positions.emplace_back(prev_it + what[1].length() - query.begin(),
66  it - query.begin());
67  }
68  return positions;
69 }
70 
// Returns a copy of `query_str` with secrets replaced by XXXXXXXX.
//
// Two rules are applied one after the other, both case-insensitively:
//  - the quoted value after `password =`, `s3_access_key =` or `s3_secret_key =`
//    becomes 'XXXXXXXX';
//  - the argument of a `\set_license` command becomes XXXXXXXX.
//
// A quoted value runs up to its closing quote. A doubled quote ('') inside the
// value is treated as an escaped quote, so the whole value is masked and nothing
// after an embedded quote is left readable. Empty values and values that span
// several lines are masked as well.
//
// query_str - text to sanitize; it is not modified.
std::string hide_sensitive_data_from_query(std::string const& query_str) {
  constexpr std::regex::flag_type flags =
      std::regex::ECMAScript | std::regex::icase | std::regex::optimize;
  static const std::vector<std::pair<std::regex, std::string>> rules{
      {std::regex(
           R"(\b((?:password|s3_access_key|s3_secret_key)\s*=\s*)'(?:[^']|'')*')",
           flags),
       "$1'XXXXXXXX'"},
      {std::regex(R"((\\set_license\s+)\S+)", flags), "$1XXXXXXXX"}};
  std::string masked{query_str};
  for (const auto& [pattern, replacement] : rules) {
    masked = std::regex_replace(masked, pattern, replacement);
  }
  return masked;
}
83 
// Tells whether the range [start, start + length) lies completely inside one of
// the given literal ranges.
//
// start             - offset of the first character of the range.
// length            - number of characters in the range.
// literal_positions - (begin, end) offset pairs to test against.
//
// Returns the end offset of the first literal range that contains the whole
// range, or -1 if no literal range contains it.
ssize_t inside_string_literal(
    const size_t start,
    const size_t length,
    const std::vector<std::pair<size_t, size_t>>& literal_positions) {
  const auto end = start + length;
  for (const auto& [literal_begin, literal_end] : literal_positions) {
    if (literal_begin <= start && end <= literal_end) {
      return static_cast<ssize_t>(literal_end);
    }
  }
  return -1;
}
96 
97 template <>
98 std::string to_string(char const*&& v) {
99  return std::string(v);
100 }
101 
102 template <>
103 std::string to_string(std::string&& v) {
104  return std::move(v);
105 }
106 
// Builds a string of `len` characters drawn uniformly from the digits and the
// lower- and upper-case ASCII letters.
//
// The generator is seeded once from std::random_device. Generator and
// distribution are function-local statics shared by every call, and no locking
// is done here.
//
// len - number of characters to produce; 0 yields an empty string.
std::string generate_random_string(const size_t len) {
  static constexpr char charset[] =
      "0123456789"
      "abcdefghijklmnopqrstuvwxyz"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  // sizeof also counts the terminating '\0', which must never be picked.
  static constexpr size_t charset_size = sizeof(charset) - 1;

  static std::mt19937 prng{std::random_device{}()};
  static std::uniform_int_distribution<size_t> dist(0, charset_size - 1);

  std::string str;
  str.reserve(len);
  for (size_t i = 0; i < len; ++i) {
    str += charset[dist(prng)];
  }
  return str;
}
123 
124 #ifndef __CUDACC__
// This version of split works almost exactly like Python's split,
// which is very conveniently designed.
// See also: https://docs.python.org/3.8/library/stdtypes.html#str.split
//
// str      - text to split.
// delim    - delimiter; when empty, any run of whitespace separates two words and
//            leading/trailing whitespace produces no empty entries.
// maxsplit - optional upper bound on the number of splits.
//
// Differences from Python in whitespace mode (empty `delim`):
//  - once `maxsplit` splits were made, the remainder is appended as is, including
//    the whitespace it starts with;
//  - a `maxsplit` of 0 does not stop the splitting, because the limit is only
//    checked right after a word has been appended.
std::vector<std::string> split(std::string_view str,
                               std::string_view delim,
                               std::optional<size_t> maxsplit) {
  std::vector<std::string> result;

  // Use an explicit delimiter.
  if (!delim.empty()) {
    std::string::size_type i = 0, j = 0;
    while ((i = str.find(delim, i)) != std::string::npos &&
           (!maxsplit || result.size() < maxsplit.value())) {
      result.emplace_back(str, j, i - j);
      i += delim.size();
      j = i;
    }
    // Whatever follows the last delimiter that was used, possibly empty.
    result.emplace_back(str, j, std::string::npos);
    return result;
  }

  // Treat any number of consecutive whitespace characters as a delimiter.
  // The cast keeps the argument within the range std::isspace is defined for,
  // even when char is signed and the text contains bytes >= 0x80.
  const auto is_space = [](const char ch) {
    return std::isspace(static_cast<unsigned char>(ch)) != 0;
  };
  bool prev_ws = true;
  std::string::size_type j = 0;
  for (std::string::size_type i = 0; i < str.size(); ++i) {
    if (prev_ws) {
      if (!is_space(str[i])) {
        // start of word
        prev_ws = false;
        j = i;
      }
    } else if (is_space(str[i])) {
      // start of space
      result.emplace_back(str, j, i - j);
      prev_ws = true;
      j = i;
      if (maxsplit && result.size() == maxsplit.value()) {
        // stop early if maxsplit was reached
        result.emplace_back(str, j, std::string::npos);
        return result;
      }
    }
  }
  if (!prev_ws) {
    // the text ended in the middle of a word
    result.emplace_back(str, j, std::string::npos);
  }
  return result;
}
176 
// Returns a copy of `str` without the whitespace at its left and right ends.
// Whitespace between non-whitespace characters is kept.
//
// str - text to trim; a string made up only of whitespace yields "".
std::string strip(std::string_view str) {
  // The cast keeps the argument within the range std::isspace is defined for,
  // even when char is signed and the text contains bytes >= 0x80.
  const auto is_space = [](const char ch) {
    return std::isspace(static_cast<unsigned char>(ch)) != 0;
  };
  std::string_view::size_type first = 0;
  while (first < str.size() && is_space(str[first])) {
    ++first;
  }
  std::string_view::size_type last = str.size();
  while (last > first && is_space(str[last - 1])) {
    --last;
  }
  return std::string(str.substr(first, last - first));
}
185 #endif // __CUDACC__
186 
// Sanitizes an SQL string in place: every newline, carriage return and tab that
// is outside of a quoted region is replaced by a single space.
//
// A quoted region starts at a ' or " and ends at the next quote of the same
// type. A quote that directly follows an unpaired backslash neither starts nor
// ends a region.
//
// str - text to sanitize; modified in place.
//
// Returns true if no quoted region was still open at the end of the string,
// false otherwise.
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(
    std::string& str) noexcept {
  char inside_quote = 0;
  bool previous_c_was_backslash = false;
  for (auto& c : str) {
    // if this character is a quote of either type
    if (c == '\'' || c == '\"') {
      // ignore if previous character was a backslash
      if (!previous_c_was_backslash) {
        // start or end of a quoted region
        if (inside_quote == c) {
          // end region
          inside_quote = 0;
        } else if (inside_quote == 0) {
          // start region
          inside_quote = c;
        }
        // a quote of the other type inside a region is left alone
      }
    } else if (inside_quote == 0) {
      // outside quoted region
      if (c == '\n' || c == '\t' || c == '\r') {
        // replace these with space
        c = ' ';
      }
    }
    // handle backslashes, except for double backslashes
    if (c == '\\') {
      previous_c_was_backslash = !previous_c_was_backslash;
    } else {
      previous_c_was_backslash = false;
    }
  }
  // if we didn't end a region, there were unclosed or mixed-nested quotes
  // accounting for backslashes should mean that this should only be the
  // case with truly malformed strings which Calcite will barf on anyway
  return (inside_quote == 0);
}
225 
// Removes one pair of matching quotes that encloses the whole string.
//
// str - text to unquote; modified in place only when quotes are removed.
//
// Returns true if `str` had at least two characters, started with ' or " and
// ended with the same character; in that case the first and last characters are
// erased. Returns false and leaves `str` untouched otherwise.
bool unquote(std::string& str) {
  if (str.size() <= 1) {
    return false;
  }
  const char opening = str.front();
  const bool opens_with_quote = (opening == '\'' || opening == '"');
  if (!opens_with_quote || str.back() != opening) {
    return false;
  }
  str.pop_back();
  str.erase(str.begin());
  return true;
}
235 
236 #ifndef __CUDACC__
// Quotes a string while escaping any existing quotes in the string.
//
// filename - text to quote.
// quote    - character placed before and after the text.
// escape   - character written in front of every `quote` or `escape` character
//            that occurs inside the text.
std::string get_quoted_string(const std::string& filename, char quote, char escape) {
  std::stringstream quoted_out;
  // TODO: std::quoted prevents string_view Jun 2020
  quoted_out << std::quoted(filename, quote, escape);
  return quoted_out.str();
}
242 
243 void filename_security_check(const std::string& filename) {
244  // We can always relax some of these rules later.
245 
246  // Canonicalize the filename, rejecting it if this basic step fails.
248  auto can = boost::filesystem::weakly_canonical(
249  filename, ec); // TODO: prevents string_view Jun 2020
250  if (ec) {
251  throw std::runtime_error("invalid filename: " + filename);
252  }
253 
254  // Reject any filenames containing whitespace for now.
255  for (const auto& ch : filename) {
256  if (std::isspace(ch)) {
257  throw std::runtime_error("invalid filename (whitespace): " + filename);
258  }
259  }
260 
261  // Reject any punctuation characters except for a few safe ones.
262  static const std::string safe_punctuation{"./_+-=:"};
263  for (const auto& ch : filename) {
264  if (std::ispunct(ch) && safe_punctuation.find(ch) == std::string::npos) {
265  throw std::runtime_error("invalid filename (punctuation): " + filename);
266  }
267  }
268 
269  // Reject any blacklisted filenames.
270  static const std::vector<std::string> blacklisted_filenames = {
271  "/etc/passwd", "/etc/passwd-", "/etc/shadow", "/etc/shadow-"};
272  if (std::find(blacklisted_filenames.begin(),
273  blacklisted_filenames.end(),
274  can.string()) != blacklisted_filenames.end()) {
275  throw std::runtime_error("invalid filename (blacklist): " + filename);
276  }
277 }
278 #endif // __CUDACC__
std::string hide_sensitive_data_from_query(std::string const &query_str)
std::string filename(char const *path)
Definition: Logger.cpp:62
ssize_t inside_string_literal(const size_t start, const size_t length, const std::vector< std::pair< size_t, size_t >> &literal_positions)
std::vector< std::pair< size_t, size_t > > find_string_literals(const std::string &query)
std::string strip(std::string_view str)
trim any whitespace from the left and right ends of a string
#define CHECK_GT(x, y)
Definition: Logger.h:209
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
std::string get_quoted_string(const std::string &filename, char quote, char escape)
Quote a string while escaping any existing quotes in the string.
void filename_security_check(const std::string &filename)
Throw exception if security problems found in a filename.
std::string generate_random_string(const size_t len)
const int8_t const int64_t const uint64_t const int32_t const int64_t int64_t uint32_t const int64_t int32_t * error_code
void apply_shim(std::string &result, const boost::regex &reg_expr, const std::function< void(std::string &, const boost::smatch &)> &shim_fn)
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(std::string &str) noexcept
sanitize an SQL string
bool unquote(std::string &str)