OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "Logger/Logger.h"
21 #include "Shared/sqldefs.h"
22 #include "Shared/sqltypes.h"
23 #include "StringOpInfo.h"
24 
25 #include <algorithm>
26 #include <bitset>
27 #include <cctype>
28 #include <cmath>
29 #include <map>
30 #include <memory>
31 #include <optional>
32 #include <string>
33 #include <string_view>
34 #include <utility>
35 #include <vector>
36 
37 namespace StringOps_Namespace {
38 
// A string value paired with a null flag.
// Building from text marks the value as null when that text is empty; the
// default constructor produces a null value holding an empty string.
struct NullableStrType {
  NullableStrType() : is_null(true) {}
  NullableStrType(const std::string& str) : str(str), is_null(str.empty()) {}
  NullableStrType(const std::string_view sv) : str(sv), is_null(sv.empty()) {}

  // Returns a (text, is_null) copy of this value.
  std::pair<std::string, bool> toPair() const { return std::make_pair(str, is_null); }

  std::string str;
  bool is_null;
};
49 
50 struct StringOp {
51  public:
52  StringOp(const SqlStringOpKind op_kind,
53  const std::optional<std::string>& var_str_optional_literal)
54  : op_kind_(op_kind)
55  , return_ti_(SQLTypeInfo(kTEXT))
56  , has_var_str_literal_(var_str_optional_literal.has_value())
57  , var_str_literal_(!var_str_optional_literal.has_value()
58  ? NullableStrType()
59  : NullableStrType(var_str_optional_literal.value())) {}
60 
61  StringOp(const SqlStringOpKind op_kind,
62  const SQLTypeInfo& return_ti,
63  const std::optional<std::string>& var_str_optional_literal)
64  : op_kind_(op_kind)
65  , return_ti_(return_ti)
66  , has_var_str_literal_(var_str_optional_literal.has_value())
67  , var_str_literal_(!var_str_optional_literal.has_value()
68  ? NullableStrType()
69  : NullableStrType(var_str_optional_literal.value())) {}
70 
71  virtual ~StringOp() = default;
72 
73  virtual NullableStrType operator()(std::string const&) const = 0;
74 
75  virtual NullableStrType operator()(const std::string& str1,
76  const std::string& str2) const {
77  UNREACHABLE() << "operator(str1, str2) not allowed for this method";
78  // Make compiler happy
79  return NullableStrType();
80  }
81 
82  virtual NullableStrType operator()() const {
83  CHECK(hasVarStringLiteral());
84  if (var_str_literal_.is_null) {
85  return var_str_literal_;
86  }
87  return operator()(var_str_literal_.str);
88  }
89 
90  virtual Datum numericEval(const std::string_view str) const {
91  UNREACHABLE() << "numericEval not allowed for this method";
92  // Make compiler happy
93  return NullDatum(SQLTypeInfo());
94  }
95 
96  virtual Datum numericEval() const {
97  CHECK(hasVarStringLiteral());
98  if (var_str_literal_.is_null) {
99  return NullDatum(return_ti_);
100  }
101  return numericEval(var_str_literal_.str);
102  }
103 
104  virtual const SQLTypeInfo& getReturnType() const { return return_ti_; }
105 
106  const std::string& getVarStringLiteral() const {
107  CHECK(hasVarStringLiteral());
108  return var_str_literal_.str;
109  }
110 
111  bool hasVarStringLiteral() const { return has_var_str_literal_; }
112 
113  protected:
114  static boost::regex generateRegex(const std::string& op_name,
115  const std::string& regex_pattern,
116  const std::string& regex_params,
117  const bool supports_sub_matches);
118 
119  const SqlStringOpKind op_kind_;
120  const SQLTypeInfo return_ti_;
121  const bool has_var_str_literal_{false};
122  const NullableStrType var_str_literal_;
123 };
124 
125 struct TryStringCast : public StringOp {
126  public:
127  TryStringCast(const SQLTypeInfo& return_ti,
128  const std::optional<std::string>& var_str_optional_literal)
129  : StringOp(SqlStringOpKind::TRY_STRING_CAST, return_ti, var_str_optional_literal) {}
130 
131  NullableStrType operator()(const std::string& str) const override;
132  Datum numericEval(const std::string_view str) const override;
133 };
134 
135 struct Position : public StringOp {
136  public:
137  Position(const std::optional<std::string>& var_str_optional_literal,
138  const std::string& search_str)
139  : StringOp(SqlStringOpKind::POSITION,
141  var_str_optional_literal)
142  , search_str_(search_str)
143  , start_(0) {}
144 
145  Position(const std::optional<std::string>& var_str_optional_literal,
146  const std::string& search_str,
147  const int64_t start)
148  : StringOp(SqlStringOpKind::POSITION,
150  var_str_optional_literal)
151  , search_str_(search_str)
152  , start_(start > 0 ? start - 1 : start) {}
153 
154  NullableStrType operator()(const std::string& str) const override;
155  Datum numericEval(const std::string_view str) const override;
156 
157  private:
158  const std::string search_str_;
159  const int64_t start_;
160 };
161 
162 struct Lower : public StringOp {
163  Lower(const std::optional<std::string>& var_str_optional_literal)
164  : StringOp(SqlStringOpKind::LOWER, var_str_optional_literal) {}
165 
166  NullableStrType operator()(const std::string& str) const override;
167 };
168 
169 struct Upper : public StringOp {
170  Upper(const std::optional<std::string>& var_str_optional_literal)
171  : StringOp(SqlStringOpKind::UPPER, var_str_optional_literal) {}
172  NullableStrType operator()(const std::string& str) const override;
173 };
174 
// Builds a 256-entry membership bitmap with one bit set per byte value that
// occurs in chars_to_set.
//
// Each character is converted to unsigned char before being used as the bit
// index. Where plain char is signed, a byte >= 0x80 would otherwise convert to
// an enormous size_t and std::bitset::set would throw std::out_of_range.
inline std::bitset<256> build_char_bitmap(const std::string& chars_to_set) {
  std::bitset<256> char_bitmap;
  for (const char str_char : chars_to_set) {
    char_bitmap.set(static_cast<unsigned char>(str_char));
  }
  return char_bitmap;
}
182 
183 struct InitCap : public StringOp {
184  InitCap(const std::optional<std::string>& var_str_optional_literal)
185  : StringOp(SqlStringOpKind::INITCAP, var_str_optional_literal)
186  , delimiter_bitmap_(build_char_bitmap(InitCap::delimiter_chars)) {}
187 
188  NullableStrType operator()(const std::string& str) const override;
189 
190  private:
191  static constexpr char const* delimiter_chars = R"(!?@"^#$&~_,.:;+-*%/|\[](){}<>)";
192  const std::bitset<256> delimiter_bitmap_;
193 };
194 
195 struct Reverse : public StringOp {
196  Reverse(const std::optional<std::string>& var_str_optional_literal)
197  : StringOp(SqlStringOpKind::REVERSE, var_str_optional_literal) {}
198 
199  NullableStrType operator()(const std::string& str) const override;
200 };
201 
202 struct Repeat : public StringOp {
203  public:
204  Repeat(const std::optional<std::string>& var_str_optional_literal, const int64_t n)
205  : StringOp(SqlStringOpKind::REPEAT, var_str_optional_literal)
206  , n_(n >= 0 ? n : 0UL) {
207  if (n < 0) {
208  throw std::runtime_error("Number of repeats must be >= 0");
209  }
210  }
211 
212  NullableStrType operator()(const std::string& str) const override;
213 
214  private:
215  const size_t n_;
216 };
217 
218 struct Concat : public StringOp {
219  Concat(const std::optional<std::string>& var_str_optional_literal,
220  const std::string& str_literal,
221  const bool reverse_order)
222  : StringOp(reverse_order ? SqlStringOpKind::RCONCAT : SqlStringOpKind::CONCAT,
223  var_str_optional_literal)
224  , str_literal_(str_literal)
225  , reverse_order_(reverse_order) {}
226 
227  Concat(const std::optional<std::string>& var_str_optional_literal)
228  : StringOp(SqlStringOpKind::CONCAT, var_str_optional_literal)
229  , reverse_order_(false) {}
230 
231  NullableStrType operator()(const std::string& str) const override;
232 
233  NullableStrType operator()(const std::string& str1,
234  const std::string& str2) const override;
235 
236  const std::string str_literal_;
237  const bool reverse_order_;
238 };
239 
240 struct Pad : public StringOp {
241  public:
242  enum class PadMode { LEFT, RIGHT };
243 
244  Pad(const std::optional<std::string>& var_str_optional_literal,
245  const SqlStringOpKind op_kind,
246  const int64_t padded_length,
247  const std::string& padding_string)
248  : StringOp(op_kind, var_str_optional_literal)
249  , pad_mode_(Pad::op_kind_to_pad_mode(op_kind))
250  , padded_length_(static_cast<size_t>(padded_length))
251  , padding_string_(padding_string.empty() ? " " : padding_string)
252  , padding_string_length_(padding_string.size())
253  , padding_char_(padding_string.empty() ? ' ' : padding_string[0]) {}
254 
255  NullableStrType operator()(const std::string& str) const override;
256 
257  private:
258  std::string lpad(const std::string& str) const;
259 
260  std::string rpad(const std::string& str) const;
261 
262  static PadMode op_kind_to_pad_mode(const SqlStringOpKind op_kind);
263 
264  const PadMode pad_mode_;
265  const size_t padded_length_;
266  const std::string padding_string_;
267  const size_t padding_string_length_;
268  const char padding_char_;
269 };
270 
271 struct Trim : public StringOp {
272  public:
273  enum class TrimMode { LEFT, RIGHT, BOTH };
274 
275  Trim(const std::optional<std::string>& var_str_optional_literal,
276  const SqlStringOpKind op_kind,
277  const std::string& trim_chars)
278  : StringOp(op_kind, var_str_optional_literal)
279  , trim_mode_(Trim::op_kind_to_trim_mode(op_kind))
280  , trim_char_bitmap_(build_char_bitmap(trim_chars.empty() ? " " : trim_chars)) {}
281 
282  NullableStrType operator()(const std::string& str) const override;
283 
284  private:
285  static TrimMode op_kind_to_trim_mode(const SqlStringOpKind op_kind);
286 
287  const TrimMode trim_mode_;
288  const std::bitset<256> trim_char_bitmap_;
289 };
290 
291 struct Substring : public StringOp {
292  // First constructor is for CALCITE SUBSTRING(str FROM start_pos),
293  // which returns the substring of str from start_pos
294  // until the end of the string
295  // Note start_pos is 1-indexed, unless input is 0
296 
297  Substring(const std::optional<std::string>& var_str_optional_literal,
298  const int64_t start)
299  : StringOp(SqlStringOpKind::SUBSTRING, var_str_optional_literal)
300  , start_(start > 0 ? start - 1 : start)
301  , length_(std::string::npos) {}
302 
303  // Second constructor is for CALCITE
304  // SUBSTRING(str FROM start_pos FOR length),
305  // which copies from start_pos for length characters
306  // Note start_pos is 1-indexed, unless input is 0
307  Substring(const std::optional<std::string>& var_str_optional_literal,
308  const int64_t start,
309  const int64_t length)
310  : StringOp(SqlStringOpKind::SUBSTRING, var_str_optional_literal)
311  , start_(start > 0 ? start - 1 : start)
312  , length_(static_cast<size_t>(length >= 0 ? length : 0)) {}
313 
314  NullableStrType operator()(const std::string& str) const override;
315 
316  // Make string_view version?
317  const int64_t start_;
318  const size_t length_;
319 };
320 
321 struct Overlay : public StringOp {
322  Overlay(const std::optional<std::string>& var_str_optional_literal,
323  const std::string& insert_str,
324  const int64_t start)
325  : StringOp(SqlStringOpKind::OVERLAY, var_str_optional_literal)
326  , insert_str_(insert_str)
327  , start_(start > 0 ? start - 1 : start)
328  , replacement_length_(insert_str_.size()) {}
329 
330  Overlay(const std::optional<std::string>& var_str_optional_literal,
331  const std::string& insert_str,
332  const int64_t start,
333  const int64_t replacement_length)
334  : StringOp(SqlStringOpKind::OVERLAY, var_str_optional_literal)
335  , insert_str_(insert_str)
336  , start_(start > 0 ? start - 1 : start)
337  , replacement_length_(
338  static_cast<size_t>(replacement_length >= 0 ? replacement_length : 0)) {}
339 
340  NullableStrType operator()(const std::string& base_str) const override;
341 
342  // Make string_view version?
343  const std::string insert_str_;
344  const int64_t start_;
345  const size_t replacement_length_;
346 };
347 
348 struct Replace : public StringOp {
349  Replace(const std::optional<std::string>& var_str_optional_literal,
350  const std::string& pattern_str,
351  const std::string& replacement_str)
352  : StringOp(SqlStringOpKind::REPLACE, var_str_optional_literal)
353  , pattern_str_(pattern_str)
354  , replacement_str_(replacement_str)
355  , pattern_str_len_(pattern_str.size())
356  , replacement_str_len_(replacement_str.size()) {}
357 
358  NullableStrType operator()(const std::string& str) const override;
359 
360  const std::string pattern_str_;
361  const std::string replacement_str_;
362  const size_t pattern_str_len_;
363  const size_t replacement_str_len_;
364 };
365 
366 struct SplitPart : public StringOp {
367  SplitPart(const std::optional<std::string>& var_str_optional_literal,
368  const std::string& delimiter,
369  const int64_t split_part)
370  : StringOp(SqlStringOpKind::SPLIT_PART, var_str_optional_literal)
371  , delimiter_(delimiter)
372  , split_part_(split_part == 0 ? 1UL : std::abs(split_part))
373  , delimiter_length_(delimiter.size())
374  , reverse_(split_part < 0) {}
375 
376  NullableStrType operator()(const std::string& str) const override;
377 
378  // Make string_view version?
379 
380  const std::string delimiter_;
381  const size_t split_part_;
382  const size_t delimiter_length_;
383  const bool reverse_;
384 };
385 
386 struct RegexpSubstr : public StringOp {
387  public:
388  RegexpSubstr(const std::optional<std::string>& var_str_optional_literal,
389  const std::string& regex_pattern,
390  const int64_t start_pos,
391  const int64_t occurrence,
392  const std::string& regex_params,
393  const int64_t sub_match_group_idx)
394  : StringOp(SqlStringOpKind::REGEXP_SUBSTR, var_str_optional_literal)
395  , regex_pattern_str_(
396  regex_pattern) // for toString() as std::regex does not have str() method
397  , regex_pattern_(
398  StringOp::generateRegex("REGEXP_SUBSTR", regex_pattern, regex_params, true))
399  , start_pos_(start_pos > 0 ? start_pos - 1 : start_pos)
400  , occurrence_(occurrence > 0 ? occurrence - 1 : occurrence)
401  , sub_match_info_(set_sub_match_info(regex_params, sub_match_group_idx)) {}
402 
403  NullableStrType operator()(const std::string& str) const override;
404 
405  private:
406  static std::string get_sub_match(const boost::smatch& match,
407  const std::pair<bool, int64_t> sub_match_info);
408 
409  static std::pair<bool, int64_t> set_sub_match_info(const std::string& regex_pattern,
410  const int64_t sub_match_group_idx);
411 
412  const std::string regex_pattern_str_;
413  const boost::regex regex_pattern_;
414  const int64_t start_pos_;
415  const int64_t occurrence_;
416  const std::pair<bool, int64_t> sub_match_info_;
417 };
418 
419 struct RegexpReplace : public StringOp {
420  public:
421  RegexpReplace(const std::optional<std::string>& var_str_optional_literal,
422  const std::string& regex_pattern,
423  const std::string& replacement,
424  const int64_t start_pos,
425  const int64_t occurrence,
426  const std::string& regex_params)
427  : StringOp(SqlStringOpKind::REGEXP_REPLACE, var_str_optional_literal)
428  , regex_pattern_str_(
429  regex_pattern) // for toString() as std::regex does not have str() method
430  , regex_pattern_(
431  StringOp::generateRegex("REGEXP_REPLACE", regex_pattern, regex_params, false))
432  , replacement_(replacement)
433  , start_pos_(start_pos > 0 ? start_pos - 1 : start_pos)
434  , occurrence_(occurrence) {}
435 
436  NullableStrType operator()(const std::string& str) const override;
437 
438  private:
439  static std::pair<size_t, size_t> get_nth_regex_match(const std::string& str,
440  const size_t start_pos,
441  const boost::regex& regex_pattern,
442  const int64_t occurrence);
443 
444  const std::string regex_pattern_str_;
445  const boost::regex regex_pattern_;
446  const std::string replacement_;
447  const int64_t start_pos_;
448  const int64_t occurrence_;
449 };
450 
451 // We currently do not allow strict mode JSON parsing per the SQL standard, as
452 // 1) We can't throw run-time errors in the case that the string operator
453 // is evaluated in an actual kernel, which is the case for none-encoded text
454 // inputs, and would need to capture the parsing and key errors and set
455 // kernel error flags accordingly. Currently throwing an error in even a CPU
456 // kernel will crash the server as it's not caught (by design, as executor kernels
457 // use error codes so that GPU and CPU code can throw errors).
458 // 2) When JSON_VALUE (or other not-yet-implemented JSON operators) is run over
459 // a string dictionary, if the column shares a dictionary such that the dictionary
460 // contains entries not in the column, we can throw errors for fields not in the
461 // actual column, as we compute JSON_VALUE for all values in the dictionary
462 // pre-kernel launch to build the string dictionary translation map. Since the
463 // offending values may not actually be in the column (when it references a
464 // shared dict), there is not even a way for the user to filter out or
465 // case-guard the offending values
466 // Todo(todd): Implement proper error infra for StringOps, both for the
467 // none-encoded and dictionary encoded paths
468 
469 struct JsonValue : public StringOp {
470  public:
471  JsonValue(const std::optional<std::string>& var_str_optional_literal,
472  const std::string& json_path)
473  : StringOp(SqlStringOpKind::JSON_VALUE, var_str_optional_literal)
474  , json_parse_mode_(parse_json_parse_mode(json_path))
475  , json_keys_(parse_json_path(json_path)) {}
476 
477  NullableStrType operator()(const std::string& str) const override;
478 
479  private:
480  enum class JsonKeyKind { JSON_OBJECT, JSON_ARRAY };
481  enum class JsonParseMode { PARSE_MODE_LAX, PARSE_MODE_STRICT };
482 
483  struct JsonKey {
484  JsonKeyKind key_kind;
485  std::string object_key;
486  // Todo (todd): Support array ranges ala SQL Server
487  size_t array_key;
488 
489  JsonKey(const std::string& object_key)
490  : key_kind(JsonKeyKind::JSON_OBJECT), object_key(object_key) {}
491  JsonKey(const size_t array_key)
492  : key_kind(JsonKeyKind::JSON_ARRAY), array_key(array_key) {}
493  };
494 
495  static JsonParseMode parse_json_parse_mode(std::string_view json_path);
496  static std::vector<JsonKey> parse_json_path(const std::string& json_path);
497  inline NullableStrType handle_parse_error(const std::string& json_str) const {
498  if (json_parse_mode_ == JsonParseMode::PARSE_MODE_LAX) {
499  return NullableStrType();
500  } else {
501  throw std::runtime_error("Could not parse: " + json_str + ".");
502  }
503  }
504 
505  inline NullableStrType handle_key_error(const std::string& json_str) const {
506  if (json_parse_mode_ == JsonParseMode::PARSE_MODE_LAX) {
507  return NullableStrType();
508  } else {
509  throw std::runtime_error("Key not found or did not contain value in: " + json_str +
510  ".");
511  }
512  }
513  static constexpr bool allow_strict_json_parsing{false};
514  const JsonParseMode json_parse_mode_; // If PARSE_MODE_LAX return null and don't throw
515  // error on parsing error
516  const std::vector<JsonKey> json_keys_;
517 };
518 
519 struct Base64Encode : public StringOp {
520  Base64Encode(const std::optional<std::string>& var_str_optional_literal)
521  : StringOp(SqlStringOpKind::BASE64_ENCODE, var_str_optional_literal) {}
522 
523  NullableStrType operator()(const std::string& str) const override;
524 };
525 
526 struct Base64Decode : public StringOp {
527  Base64Decode(const std::optional<std::string>& var_str_optional_literal)
528  : StringOp(SqlStringOpKind::BASE64_DECODE, var_str_optional_literal) {}
529 
530  NullableStrType operator()(const std::string& str) const override;
531 };
532 
533 struct NullOp : public StringOp {
534  NullOp(const std::optional<std::string>& var_str_optional_literal,
535  const SqlStringOpKind op_kind)
536  : StringOp(SqlStringOpKind::INVALID, var_str_optional_literal), op_kind_(op_kind) {}
537 
538  NullableStrType operator()(const std::string& str) const override {
539  return NullableStrType(); // null string
540  }
541 
542  const SqlStringOpKind op_kind_;
543 };
544 
545 std::unique_ptr<const StringOp> gen_string_op(const StringOpInfo& string_op_info);
546 
547 std::pair<std::string, bool /* is null */> apply_string_op_to_literals(
548  const StringOpInfo& string_op_info);
549 
550 Datum apply_numeric_op_to_literals(const StringOpInfo& string_op_info);
551 
552 class StringOps {
553  public:
554  StringOps() : string_ops_(genStringOpsFromOpInfos({})), num_ops_(0UL) {}
555 
556  StringOps(const std::vector<StringOpInfo>& string_op_infos)
557  : string_ops_(genStringOpsFromOpInfos(string_op_infos))
558  , num_ops_(string_op_infos.size()) {}
559 
560  std::string operator()(const std::string& str) const;
561 
562  std::string multi_input_eval(const std::string_view str1,
563  const std::string_view str2) const;
564 
565  std::string_view operator()(const std::string_view sv, std::string& sv_storage) const;
566 
567  Datum numericEval(const std::string_view str) const;
568 
569  size_t size() const { return num_ops_; }
570 
571  private:
572  std::vector<std::unique_ptr<const StringOp>> genStringOpsFromOpInfos(
573  const std::vector<StringOpInfo>& string_op_infos) const;
574 
575  const std::vector<std::unique_ptr<const StringOp>> string_ops_;
576  const size_t num_ops_;
577 };
578 
579 } // namespace StringOps_Namespace
Datum apply_numeric_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:916
const std::string json_str(const rapidjson::Value &obj) noexcept
Definition: JsonAccessors.h:44
#define UNREACHABLE()
Definition: Logger.h:337
SqlStringOpKind
Definition: sqldefs.h:89
Constants for Builtin SQL Types supported by HEAVY.AI.
CONSTEXPR DEVICE bool is_null(const T &value)
std::pair< std::string, bool > apply_string_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:905
bool g_enable_smem_group_by true
Datum NullDatum(const SQLTypeInfo &ti)
Definition: Datum.cpp:286
Definition: sqltypes.h:69
bool g_enable_watchdog false
Definition: Execute.cpp:79
#define CHECK(condition)
Definition: Logger.h:291
Common Enum definitions for SQL processing.
constexpr double n
Definition: Utm.h:38
Definition: Datum.h:67
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:715