OmniSciDB  b28c0d5765
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "Logger/Logger.h"
21 #include "Shared/sqldefs.h"
22 #include "Shared/sqltypes.h"
23 #include "StringOpInfo.h"
24 
25 #include <algorithm>
26 #include <bitset>
27 #include <cctype>
28 #include <cmath>
29 #include <map>
30 #include <memory>
31 #include <optional>
32 #include <string>
33 #include <string_view>
34 #include <utility>
35 #include <vector>
36 
37 namespace StringOps_Namespace {
38 
/// A string value paired with a null flag.
///
/// Both value-taking constructors treat an empty input as null; the default
/// constructor produces a null value with an empty string.
struct NullableStrType {
  /// Null value (empty string, is_null == true).
  NullableStrType() = default;

  /// Copies `value`; the result is null exactly when `value` is empty.
  NullableStrType(const std::string& value) : str(value), is_null(value.empty()) {}

  /// Copies the viewed characters; the result is null exactly when `sv` is empty.
  NullableStrType(const std::string_view sv) : str(sv), is_null(sv.empty()) {}

  /// Returns the (string, is_null) pair form of this value.
  std::pair<std::string, bool> toPair() const { return std::make_pair(str, is_null); }

  std::string str;
  bool is_null{true};
};
49 
50 struct StringOp {
51  public:
52  StringOp(const SqlStringOpKind op_kind,
53  const std::optional<std::string>& var_str_optional_literal)
54  : op_kind_(op_kind)
55  , return_ti_(SQLTypeInfo(kTEXT))
56  , has_var_str_literal_(var_str_optional_literal.has_value())
57  , var_str_literal_(!var_str_optional_literal.has_value()
58  ? NullableStrType()
59  : NullableStrType(var_str_optional_literal.value())) {}
60 
61  StringOp(const SqlStringOpKind op_kind,
62  const SQLTypeInfo& return_ti,
63  const std::optional<std::string>& var_str_optional_literal)
64  : op_kind_(op_kind)
65  , return_ti_(return_ti)
66  , has_var_str_literal_(var_str_optional_literal.has_value())
67  , var_str_literal_(!var_str_optional_literal.has_value()
68  ? NullableStrType()
69  : NullableStrType(var_str_optional_literal.value())) {}
70 
71  virtual ~StringOp() = default;
72 
73  virtual NullableStrType operator()(std::string const&) const = 0;
74 
75  virtual NullableStrType operator()(const std::string& str1,
76  const std::string& str2) const {
77  UNREACHABLE() << "operator(str1, str2) not allowed for this method";
78  // Make compiler happy
79  return NullableStrType();
80  }
81 
82  virtual NullableStrType operator()() const {
83  CHECK(hasVarStringLiteral());
84  if (var_str_literal_.is_null) {
85  return var_str_literal_;
86  }
87  return operator()(var_str_literal_.str);
88  }
89 
90  virtual Datum numericEval(const std::string_view str) const {
91  UNREACHABLE() << "numericEval not allowed for this method";
92  // Make compiler happy
93  return NullDatum(SQLTypeInfo());
94  }
95 
96  virtual Datum numericEval() const {
97  CHECK(hasVarStringLiteral());
98  if (var_str_literal_.is_null) {
99  return NullDatum(return_ti_);
100  }
101  return numericEval(var_str_literal_.str);
102  }
103 
104  virtual const SQLTypeInfo& getReturnType() const { return return_ti_; }
105 
106  const std::string& getVarStringLiteral() const {
107  CHECK(hasVarStringLiteral());
108  return var_str_literal_.str;
109  }
110 
111  bool hasVarStringLiteral() const { return has_var_str_literal_; }
112 
113  protected:
114  static boost::regex generateRegex(const std::string& op_name,
115  const std::string& regex_pattern,
116  const std::string& regex_params,
117  const bool supports_sub_matches);
118 
119  const SqlStringOpKind op_kind_;
120  const SQLTypeInfo return_ti_;
121  const bool has_var_str_literal_{false};
122  const NullableStrType var_str_literal_;
123 };
124 
125 struct TryStringCast : public StringOp {
126  public:
127  TryStringCast(const SQLTypeInfo& return_ti,
128  const std::optional<std::string>& var_str_optional_literal)
129  : StringOp(SqlStringOpKind::TRY_STRING_CAST, return_ti, var_str_optional_literal) {}
130 
131  NullableStrType operator()(const std::string& str) const override;
132  Datum numericEval(const std::string_view str) const override;
133 };
134 
135 struct Position : public StringOp {
136  public:
137  Position(const std::optional<std::string>& var_str_optional_literal,
138  const std::string& search_str)
139  : StringOp(SqlStringOpKind::POSITION,
141  var_str_optional_literal)
142  , search_str_(search_str)
143  , start_(0) {}
144 
145  Position(const std::optional<std::string>& var_str_optional_literal,
146  const std::string& search_str,
147  const int64_t start)
148  : StringOp(SqlStringOpKind::POSITION,
150  var_str_optional_literal)
151  , search_str_(search_str)
152  , start_(start > 0 ? start - 1 : start) {}
153 
154  NullableStrType operator()(const std::string& str) const override;
155  Datum numericEval(const std::string_view str) const override;
156 
157  private:
158  const std::string search_str_;
159  const int64_t start_;
160 };
161 
162 struct Lower : public StringOp {
163  Lower(const std::optional<std::string>& var_str_optional_literal)
164  : StringOp(SqlStringOpKind::LOWER, var_str_optional_literal) {}
165 
166  NullableStrType operator()(const std::string& str) const override;
167 };
168 
169 struct Upper : public StringOp {
170  Upper(const std::optional<std::string>& var_str_optional_literal)
171  : StringOp(SqlStringOpKind::UPPER, var_str_optional_literal) {}
172  NullableStrType operator()(const std::string& str) const override;
173 };
174 
/// Builds a 256-entry membership table with one bit set per distinct byte
/// value that appears in `chars_to_set`.
///
/// Each character is indexed through `unsigned char`. Where plain `char` is
/// signed, bytes >= 0x80 are negative and would otherwise convert to an
/// out-of-range `size_t`, making `std::bitset::set` throw `std::out_of_range`.
///
/// @param chars_to_set characters to mark; may be empty
/// @return bitmap whose bit `b` is set iff byte value `b` occurs in the input
inline std::bitset<256> build_char_bitmap(const std::string& chars_to_set) {
  std::bitset<256> char_bitmap;
  for (const char str_char : chars_to_set) {
    char_bitmap.set(static_cast<unsigned char>(str_char));
  }
  return char_bitmap;
}
182 
183 struct InitCap : public StringOp {
184  InitCap(const std::optional<std::string>& var_str_optional_literal)
185  : StringOp(SqlStringOpKind::INITCAP, var_str_optional_literal)
186  , delimiter_bitmap_(build_char_bitmap(InitCap::delimiter_chars)) {}
187 
188  NullableStrType operator()(const std::string& str) const override;
189 
190  private:
191  static constexpr char const* delimiter_chars = R"(!?@"^#$&~_,.:;+-*%/|\[](){}<>)";
192  const std::bitset<256> delimiter_bitmap_;
193 };
194 
195 struct Reverse : public StringOp {
196  Reverse(const std::optional<std::string>& var_str_optional_literal)
197  : StringOp(SqlStringOpKind::REVERSE, var_str_optional_literal) {}
198 
199  NullableStrType operator()(const std::string& str) const override;
200 };
201 
202 struct Repeat : public StringOp {
203  public:
204  Repeat(const std::optional<std::string>& var_str_optional_literal, const int64_t n)
205  : StringOp(SqlStringOpKind::REPEAT, var_str_optional_literal)
206  , n_(n >= 0 ? n : 0UL) {
207  if (n < 0) {
208  throw std::runtime_error("Number of repeats must be >= 0");
209  }
210  }
211 
212  NullableStrType operator()(const std::string& str) const override;
213 
214  private:
215  const size_t n_;
216 };
217 
218 struct Concat : public StringOp {
219  Concat(const std::optional<std::string>& var_str_optional_literal,
220  const std::string& str_literal,
221  const bool reverse_order)
222  : StringOp(reverse_order ? SqlStringOpKind::RCONCAT : SqlStringOpKind::CONCAT,
223  var_str_optional_literal)
224  , str_literal_(str_literal)
225  , reverse_order_(reverse_order) {}
226 
227  Concat(const std::optional<std::string>& var_str_optional_literal)
228  : StringOp(SqlStringOpKind::CONCAT, var_str_optional_literal)
229  , reverse_order_(false) {}
230 
231  NullableStrType operator()(const std::string& str) const override;
232 
233  NullableStrType operator()(const std::string& str1, const std::string& str2) const;
234 
235  const std::string str_literal_;
236  const bool reverse_order_;
237 };
238 
239 struct Pad : public StringOp {
240  public:
241  enum class PadMode { LEFT, RIGHT };
242 
243  Pad(const std::optional<std::string>& var_str_optional_literal,
244  const SqlStringOpKind op_kind,
245  const int64_t padded_length,
246  const std::string& padding_string)
247  : StringOp(op_kind, var_str_optional_literal)
248  , pad_mode_(Pad::op_kind_to_pad_mode(op_kind))
249  , padded_length_(static_cast<size_t>(padded_length))
250  , padding_string_(padding_string.empty() ? " " : padding_string)
251  , padding_string_length_(padding_string.size())
252  , padding_char_(padding_string.empty() ? ' ' : padding_string[0]) {}
253 
254  NullableStrType operator()(const std::string& str) const override;
255 
256  private:
257  std::string lpad(const std::string& str) const;
258 
259  std::string rpad(const std::string& str) const;
260 
261  static PadMode op_kind_to_pad_mode(const SqlStringOpKind op_kind);
262 
263  const PadMode pad_mode_;
264  const size_t padded_length_;
265  const std::string padding_string_;
266  const size_t padding_string_length_;
267  const char padding_char_;
268 };
269 
270 struct Trim : public StringOp {
271  public:
272  enum class TrimMode { LEFT, RIGHT, BOTH };
273 
274  Trim(const std::optional<std::string>& var_str_optional_literal,
275  const SqlStringOpKind op_kind,
276  const std::string& trim_chars)
277  : StringOp(op_kind, var_str_optional_literal)
278  , trim_mode_(Trim::op_kind_to_trim_mode(op_kind))
279  , trim_char_bitmap_(build_char_bitmap(trim_chars.empty() ? " " : trim_chars)) {}
280 
281  NullableStrType operator()(const std::string& str) const override;
282 
283  private:
284  static TrimMode op_kind_to_trim_mode(const SqlStringOpKind op_kind);
285 
286  const TrimMode trim_mode_;
287  const std::bitset<256> trim_char_bitmap_;
288 };
289 
290 struct Substring : public StringOp {
291  // First constructor is for CALCITE SUBSTRING(str FROM start_pos),
292  // which returns the substring of str from start_pos
293  // until the end of the string
294  // Note start_pos is 1-indexed, unless input is 0
295 
296  Substring(const std::optional<std::string>& var_str_optional_literal,
297  const int64_t start)
298  : StringOp(SqlStringOpKind::SUBSTRING, var_str_optional_literal)
299  , start_(start > 0 ? start - 1 : start)
300  , length_(std::string::npos) {}
301 
302  // Second constructor is for CALCITE
303  // SUBSTRING(str FROM start_pos FOR length),
304  // which copies from start_pos for length characters
305  // Note start_pos is 1-indexed, unless input is 0
306  Substring(const std::optional<std::string>& var_str_optional_literal,
307  const int64_t start,
308  const int64_t length)
309  : StringOp(SqlStringOpKind::SUBSTRING, var_str_optional_literal)
310  , start_(start > 0 ? start - 1 : start)
311  , length_(static_cast<size_t>(length >= 0 ? length : 0)) {}
312 
313  NullableStrType operator()(const std::string& str) const override;
314 
315  // Make string_view version?
316  const int64_t start_;
317  const size_t length_;
318 };
319 
320 struct Overlay : public StringOp {
321  Overlay(const std::optional<std::string>& var_str_optional_literal,
322  const std::string& insert_str,
323  const int64_t start)
324  : StringOp(SqlStringOpKind::OVERLAY, var_str_optional_literal)
325  , insert_str_(insert_str)
326  , start_(start > 0 ? start - 1 : start)
327  , replacement_length_(insert_str_.size()) {}
328 
329  Overlay(const std::optional<std::string>& var_str_optional_literal,
330  const std::string& insert_str,
331  const int64_t start,
332  const int64_t replacement_length)
333  : StringOp(SqlStringOpKind::OVERLAY, var_str_optional_literal)
334  , insert_str_(insert_str)
335  , start_(start > 0 ? start - 1 : start)
336  , replacement_length_(
337  static_cast<size_t>(replacement_length >= 0 ? replacement_length : 0)) {}
338 
339  NullableStrType operator()(const std::string& base_str) const override;
340 
341  // Make string_view version?
342  const std::string insert_str_;
343  const int64_t start_;
344  const size_t replacement_length_;
345 };
346 
347 struct Replace : public StringOp {
348  Replace(const std::optional<std::string>& var_str_optional_literal,
349  const std::string& pattern_str,
350  const std::string& replacement_str)
351  : StringOp(SqlStringOpKind::REPLACE, var_str_optional_literal)
352  , pattern_str_(pattern_str)
353  , replacement_str_(replacement_str)
354  , pattern_str_len_(pattern_str.size())
355  , replacement_str_len_(replacement_str.size()) {}
356 
357  NullableStrType operator()(const std::string& str) const override;
358 
359  const std::string pattern_str_;
360  const std::string replacement_str_;
361  const size_t pattern_str_len_;
362  const size_t replacement_str_len_;
363 };
364 
365 struct SplitPart : public StringOp {
366  SplitPart(const std::optional<std::string>& var_str_optional_literal,
367  const std::string& delimiter,
368  const int64_t split_part)
369  : StringOp(SqlStringOpKind::SPLIT_PART, var_str_optional_literal)
370  , delimiter_(delimiter)
371  , split_part_(split_part == 0 ? 1UL : std::abs(split_part))
372  , delimiter_length_(delimiter.size())
373  , reverse_(split_part < 0) {}
374 
375  NullableStrType operator()(const std::string& str) const override;
376 
377  // Make string_view version?
378 
379  const std::string delimiter_;
380  const size_t split_part_;
381  const size_t delimiter_length_;
382  const bool reverse_;
383 };
384 
385 struct RegexpSubstr : public StringOp {
386  public:
387  RegexpSubstr(const std::optional<std::string>& var_str_optional_literal,
388  const std::string& regex_pattern,
389  const int64_t start_pos,
390  const int64_t occurrence,
391  const std::string& regex_params,
392  const int64_t sub_match_group_idx)
393  : StringOp(SqlStringOpKind::REGEXP_SUBSTR, var_str_optional_literal)
394  , regex_pattern_str_(
395  regex_pattern) // for toString() as std::regex does not have str() method
396  , regex_pattern_(
397  StringOp::generateRegex("REGEXP_SUBSTR", regex_pattern, regex_params, true))
398  , start_pos_(start_pos > 0 ? start_pos - 1 : start_pos)
399  , occurrence_(occurrence > 0 ? occurrence - 1 : occurrence)
400  , sub_match_info_(set_sub_match_info(regex_params, sub_match_group_idx)) {}
401 
402  NullableStrType operator()(const std::string& str) const override;
403 
404  private:
405  static std::string get_sub_match(const boost::smatch& match,
406  const std::pair<bool, int64_t> sub_match_info);
407 
408  static std::pair<bool, int64_t> set_sub_match_info(const std::string& regex_pattern,
409  const int64_t sub_match_group_idx);
410 
411  const std::string regex_pattern_str_;
412  const boost::regex regex_pattern_;
413  const int64_t start_pos_;
414  const int64_t occurrence_;
415  const std::pair<bool, int64_t> sub_match_info_;
416 };
417 
418 struct RegexpReplace : public StringOp {
419  public:
420  RegexpReplace(const std::optional<std::string>& var_str_optional_literal,
421  const std::string& regex_pattern,
422  const std::string& replacement,
423  const int64_t start_pos,
424  const int64_t occurrence,
425  const std::string& regex_params)
426  : StringOp(SqlStringOpKind::REGEXP_REPLACE, var_str_optional_literal)
427  , regex_pattern_str_(
428  regex_pattern) // for toString() as std::regex does not have str() method
429  , regex_pattern_(
430  StringOp::generateRegex("REGEXP_REPLACE", regex_pattern, regex_params, false))
431  , replacement_(replacement)
432  , start_pos_(start_pos > 0 ? start_pos - 1 : start_pos)
433  , occurrence_(occurrence) {}
434 
435  NullableStrType operator()(const std::string& str) const override;
436 
437  private:
438  static std::pair<size_t, size_t> get_nth_regex_match(const std::string& str,
439  const size_t start_pos,
440  const boost::regex& regex_pattern,
441  const int64_t occurrence);
442 
443  const std::string regex_pattern_str_;
444  const boost::regex regex_pattern_;
445  const std::string replacement_;
446  const int64_t start_pos_;
447  const int64_t occurrence_;
448 };
449 
450 // We currently do not allow strict mode JSON parsing per the SQL standard, as
451 // 1) We can't throw run-time errors in the case that the string operator
452 // is evaluated in an actual kernel, which is the case for none-encoded text
453 // inputs, and would need to capture the parsing and key errors and set
454 // kernel error flags accordingly. Currently throwing an error in even a CPU
455 // kernel will crash the server as it's not caught (by design, as executor kernels
456 // use error codes so that GPU and CPU code can throw errors).
457 // 2) When JSON_VALUE (or other not-yet-implemented JSON operators) is run over
458 // a string dictionary, if the column shares a dictionary such that the dictionary
459 // contains entries not in the column, we can throw errors for fields not in the
460 // actual column, as we compute JSON_VALUE for all values in the dictionary
461 // pre-kernel launch to build the string dictionary translation map. Since the
462 // offending values may not actually be in the column (when it references a
463 // shared dict), there is not even a way for the user to filter out or
464 // case-guard the offending values
465 // Todo(todd): Implement proper error infra for StringOps, both for the
466 // none-encoded and dictionary encoded paths
467 
468 struct JsonValue : public StringOp {
469  public:
470  JsonValue(const std::optional<std::string>& var_str_optional_literal,
471  const std::string& json_path)
472  : StringOp(SqlStringOpKind::JSON_VALUE, var_str_optional_literal)
473  , json_parse_mode_(parse_json_parse_mode(json_path))
474  , json_keys_(parse_json_path(json_path)) {}
475 
476  NullableStrType operator()(const std::string& str) const override;
477 
478  private:
479  enum class JsonKeyKind { JSON_OBJECT, JSON_ARRAY };
480  enum class JsonParseMode { PARSE_MODE_LAX, PARSE_MODE_STRICT };
481 
482  struct JsonKey {
483  JsonKeyKind key_kind;
484  std::string object_key;
485  // Todo (todd): Support array ranges ala SQL Server
486  size_t array_key;
487 
488  JsonKey(const std::string& object_key)
489  : key_kind(JsonKeyKind::JSON_OBJECT), object_key(object_key) {}
490  JsonKey(const size_t array_key)
491  : key_kind(JsonKeyKind::JSON_ARRAY), array_key(array_key) {}
492  };
493 
494  static JsonParseMode parse_json_parse_mode(std::string_view json_path);
495  static std::vector<JsonKey> parse_json_path(const std::string& json_path);
496  inline NullableStrType handle_parse_error(const std::string& json_str) const {
497  if (json_parse_mode_ == JsonParseMode::PARSE_MODE_LAX) {
498  return NullableStrType();
499  } else {
500  throw std::runtime_error("Could not parse: " + json_str + ".");
501  }
502  }
503 
504  inline NullableStrType handle_key_error(const std::string& json_str) const {
505  if (json_parse_mode_ == JsonParseMode::PARSE_MODE_LAX) {
506  return NullableStrType();
507  } else {
508  throw std::runtime_error("Key not found or did not contain value in: " + json_str +
509  ".");
510  }
511  }
512  static constexpr bool allow_strict_json_parsing{false};
513  const JsonParseMode json_parse_mode_; // If PARSE_MODE_LAX return null and don't throw
514  // error on parsing error
515  const std::vector<JsonKey> json_keys_;
516 };
517 
518 struct Base64Encode : public StringOp {
519  Base64Encode(const std::optional<std::string>& var_str_optional_literal)
520  : StringOp(SqlStringOpKind::BASE64_ENCODE, var_str_optional_literal) {}
521 
522  NullableStrType operator()(const std::string& str) const override;
523 };
524 
525 struct Base64Decode : public StringOp {
526  Base64Decode(const std::optional<std::string>& var_str_optional_literal)
527  : StringOp(SqlStringOpKind::BASE64_DECODE, var_str_optional_literal) {}
528 
529  NullableStrType operator()(const std::string& str) const override;
530 };
531 
532 struct NullOp : public StringOp {
533  NullOp(const std::optional<std::string>& var_str_optional_literal,
534  const SqlStringOpKind op_kind)
535  : StringOp(SqlStringOpKind::INVALID, var_str_optional_literal), op_kind_(op_kind) {}
536 
537  NullableStrType operator()(const std::string& str) const {
538  return NullableStrType(); // null string
539  }
540 
541  const SqlStringOpKind op_kind_;
542 };
543 
544 std::unique_ptr<const StringOp> gen_string_op(const StringOpInfo& string_op_info);
545 
546 std::pair<std::string, bool /* is null */> apply_string_op_to_literals(
547  const StringOpInfo& string_op_info);
548 
549 Datum apply_numeric_op_to_literals(const StringOpInfo& string_op_info);
550 
551 class StringOps {
552  public:
553  StringOps() : string_ops_(genStringOpsFromOpInfos({})), num_ops_(0UL) {}
554 
555  StringOps(const std::vector<StringOpInfo>& string_op_infos)
556  : string_ops_(genStringOpsFromOpInfos(string_op_infos))
557  , num_ops_(string_op_infos.size()) {}
558 
559  std::string operator()(const std::string& str) const;
560 
561  std::string multi_input_eval(const std::string_view str1,
562  const std::string_view str2) const;
563 
564  std::string_view operator()(const std::string_view sv, std::string& sv_storage) const;
565 
566  Datum numericEval(const std::string_view str) const;
567 
568  size_t size() const { return num_ops_; }
569 
570  private:
571  std::vector<std::unique_ptr<const StringOp>> genStringOpsFromOpInfos(
572  const std::vector<StringOpInfo>& string_op_infos) const;
573 
574  const std::vector<std::unique_ptr<const StringOp>> string_ops_;
575  const size_t num_ops_;
576 };
577 
578 } // namespace StringOps_Namespace
Datum apply_numeric_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:916
const std::string json_str(const rapidjson::Value &obj) noexcept
Definition: JsonAccessors.h:44
#define UNREACHABLE()
Definition: Logger.h:266
SqlStringOpKind
Definition: sqldefs.h:88
Constants for Builtin SQL Types supported by HEAVY.AI.
CONSTEXPR DEVICE bool is_null(const T &value)
std::pair< std::string, bool > apply_string_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:905
bool g_enable_smem_group_by true
Datum NullDatum(const SQLTypeInfo &ti)
Definition: Datum.cpp:286
Definition: sqltypes.h:67
bool g_enable_watchdog false
Definition: Execute.cpp:79
#define CHECK(condition)
Definition: Logger.h:222
Common Enum definitions for SQL processing.
constexpr double n
Definition: Utm.h:38
Definition: Datum.h:44
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:715