OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "Logger/Logger.h"
21 #include "Shared/sqldefs.h"
22 #include "Shared/sqltypes.h"
23 #include "StringOpInfo.h"
24 
25 #include <algorithm>
26 #include <bitset>
27 #include <cctype>
28 #include <cmath>
29 #include <map>
30 #include <memory>
31 #include <optional>
32 #include <string>
33 #include <string_view>
34 #include <utility>
35 #include <vector>
36 
37 namespace StringOps_Namespace {
38 
/// A string value bundled with an explicit null flag.
struct NullableStrType {
  /// Builds the null value: `str` is left empty and `is_null` is set.
  NullableStrType() : is_null(true) {}

  /// Builds a non-null value holding a copy of `str`.
  NullableStrType(const std::string& str) : str(str), is_null(false) {}

  /// Builds a non-null value holding a copy of the characters viewed by `sv`.
  NullableStrType(const std::string_view& sv) : str(sv), is_null(false) {}

  /// Returns a copy of the contents as a {string, is_null} pair.
  std::pair<std::string, bool> toPair() const {
    return std::make_pair(str, is_null);
  }

  std::string str;  // payload; empty when constructed as null
  bool is_null;     // true only for the default-constructed (null) value
};
49 
50 struct StringOp {
51  public:
52  StringOp(const SqlStringOpKind op_kind,
53  const std::optional<std::string>& var_str_optional_literal)
54  : op_kind_(op_kind)
55  , has_var_str_literal_(var_str_optional_literal.has_value())
56  , var_str_literal_(!var_str_optional_literal.has_value()
57  ? NullableStrType()
58  : NullableStrType(var_str_optional_literal.value())) {}
59 
60  virtual ~StringOp() = default;
61 
62  std::string opName() const { return ::toString(op_kind_); }
63 
64  virtual NullableStrType operator()(std::string const&) const = 0;
65 
66  virtual NullableStrType operator()() const {
67  CHECK(hasVarStringLiteral());
68  if (var_str_literal_.is_null) {
69  return var_str_literal_;
70  }
71  return operator()(var_str_literal_.str);
72  }
73 
74  const std::string& getVarStringLiteral() const {
75  CHECK(hasVarStringLiteral());
76  return var_str_literal_.str;
77  }
78 
79  bool hasVarStringLiteral() const { return has_var_str_literal_; }
80 
81  protected:
82  static boost::regex generateRegex(const std::string& op_name,
83  const std::string& regex_pattern,
84  const std::string& regex_params,
85  const bool supports_sub_matches);
86 
87  const SqlStringOpKind op_kind_;
88  const bool has_var_str_literal_{false};
89  const NullableStrType var_str_literal_;
90 };
91 
92 struct Lower : public StringOp {
93  Lower(const std::optional<std::string>& var_str_optional_literal)
94  : StringOp(SqlStringOpKind::LOWER, var_str_optional_literal) {}
95 
96  NullableStrType operator()(const std::string& str) const override;
97 };
98 
99 struct Upper : public StringOp {
100  Upper(const std::optional<std::string>& var_str_optional_literal)
101  : StringOp(SqlStringOpKind::UPPER, var_str_optional_literal) {}
102  NullableStrType operator()(const std::string& str) const override;
103 };
104 
/// Builds a 256-entry membership bitmap with one bit set per distinct byte
/// value occurring in `chars_to_set`.
///
/// Each character is converted to `unsigned char` before indexing. Where plain
/// `char` is signed, a byte >= 0x80 is negative and would otherwise convert to
/// a huge `size_t`, making `std::bitset::set` throw `std::out_of_range`.
///
/// NOTE(review): code that tests this bitmap should index it with the
/// `unsigned char` value of the byte as well — confirm at the call sites.
///
/// @param chars_to_set  bytes whose bits should be set; may be empty
/// @return bitmap in which bit `b` is set iff byte value `b` occurs in the input
inline std::bitset<256> build_char_bitmap(const std::string& chars_to_set) {
  std::bitset<256> char_bitmap;
  for (const char str_char : chars_to_set) {
    char_bitmap.set(static_cast<unsigned char>(str_char));
  }
  return char_bitmap;
}
112 
113 struct InitCap : public StringOp {
114  InitCap(const std::optional<std::string>& var_str_optional_literal)
115  : StringOp(SqlStringOpKind::INITCAP, var_str_optional_literal)
116  , delimiter_bitmap_(build_char_bitmap(InitCap::delimiter_chars)) {}
117 
118  NullableStrType operator()(const std::string& str) const override;
119 
120  private:
121  static constexpr char const* delimiter_chars = R"(!?@"^#$&~_,.:;+-*%/|\[](){}<>)";
122  const std::bitset<256> delimiter_bitmap_;
123 };
124 
125 struct Reverse : public StringOp {
126  Reverse(const std::optional<std::string>& var_str_optional_literal)
127  : StringOp(SqlStringOpKind::REVERSE, var_str_optional_literal) {}
128 
129  NullableStrType operator()(const std::string& str) const override;
130 };
131 
132 struct Repeat : public StringOp {
133  public:
134  Repeat(const std::optional<std::string>& var_str_optional_literal, const int64_t n)
135  : StringOp(SqlStringOpKind::REPEAT, var_str_optional_literal)
136  , n_(n >= 0 ? n : 0UL) {
137  if (n < 0) {
138  throw std::runtime_error("Number of repeats must be >= 0");
139  }
140  }
141 
142  NullableStrType operator()(const std::string& str) const override;
143 
144  private:
145  const size_t n_;
146 };
147 
148 struct Concat : public StringOp {
149  Concat(const std::optional<std::string>& var_str_optional_literal,
150  const std::string& str_literal,
151  const bool reverse_order)
152  : StringOp(reverse_order ? SqlStringOpKind::RCONCAT : SqlStringOpKind::CONCAT,
153  var_str_optional_literal)
154  , str_literal_(str_literal)
155  , reverse_order_(reverse_order) {}
156 
157  NullableStrType operator()(const std::string& str) const override;
158 
159  const std::string str_literal_;
160  const bool reverse_order_;
161 };
162 
163 struct Pad : public StringOp {
164  public:
165  enum class PadMode { LEFT, RIGHT };
166 
167  Pad(const std::optional<std::string>& var_str_optional_literal,
168  const SqlStringOpKind op_kind,
169  const int64_t padded_length,
170  const std::string& padding_string)
171  : StringOp(op_kind, var_str_optional_literal)
172  , pad_mode_(Pad::op_kind_to_pad_mode(op_kind))
173  , padded_length_(static_cast<size_t>(padded_length))
174  , padding_string_(padding_string.empty() ? " " : padding_string)
175  , padding_string_length_(padding_string.size())
176  , padding_char_(padding_string.empty() ? ' ' : padding_string[0]) {}
177 
178  NullableStrType operator()(const std::string& str) const override;
179 
180  private:
181  std::string lpad(const std::string& str) const;
182 
183  std::string rpad(const std::string& str) const;
184 
185  static PadMode op_kind_to_pad_mode(const SqlStringOpKind op_kind);
186 
187  const PadMode pad_mode_;
188  const size_t padded_length_;
189  const std::string padding_string_;
190  const size_t padding_string_length_;
191  const char padding_char_;
192 };
193 
194 struct Trim : public StringOp {
195  public:
196  enum class TrimMode { LEFT, RIGHT, BOTH };
197 
198  Trim(const std::optional<std::string>& var_str_optional_literal,
199  const SqlStringOpKind op_kind,
200  const std::string& trim_chars)
201  : StringOp(op_kind, var_str_optional_literal)
202  , trim_mode_(Trim::op_kind_to_trim_mode(op_kind))
203  , trim_char_bitmap_(build_char_bitmap(trim_chars.empty() ? " " : trim_chars)) {}
204 
205  NullableStrType operator()(const std::string& str) const override;
206 
207  private:
208  static TrimMode op_kind_to_trim_mode(const SqlStringOpKind op_kind);
209 
210  const TrimMode trim_mode_;
211  const std::bitset<256> trim_char_bitmap_;
212 };
213 
214 struct Substring : public StringOp {
215  // First constructor is for CALCITE SUBSTRING(str FROM start_pos),
216  // which returns the substring of str from start_pos
217  // until the end of the string
218  // Note start_pos is 1-indexed, unless input is 0
219 
220  Substring(const std::optional<std::string>& var_str_optional_literal,
221  const int64_t start)
222  : StringOp(SqlStringOpKind::SUBSTRING, var_str_optional_literal)
223  , start_(start > 0 ? start - 1 : start)
224  , length_(std::string::npos) {}
225 
226  // Second constructor is for CALCITE
227  // SUBSTRING(str FROM start_pos FOR length),
228  // which copies from start_pos for length characters
229  // Note start_pos is 1-indexed, unless input is 0
230  Substring(const std::optional<std::string>& var_str_optional_literal,
231  const int64_t start,
232  const int64_t length)
233  : StringOp(SqlStringOpKind::SUBSTRING, var_str_optional_literal)
234  , start_(start > 0 ? start - 1 : start)
235  , length_(static_cast<size_t>(length >= 0 ? length : 0)) {}
236 
237  NullableStrType operator()(const std::string& str) const override;
238 
239  // Make string_view version?
240  const int64_t start_;
241  const size_t length_;
242 };
243 
244 struct Overlay : public StringOp {
245  Overlay(const std::optional<std::string>& var_str_optional_literal,
246  const std::string& insert_str,
247  const int64_t start)
248  : StringOp(SqlStringOpKind::OVERLAY, var_str_optional_literal)
249  , insert_str_(insert_str)
250  , start_(start > 0 ? start - 1 : start)
251  , replacement_length_(insert_str_.size()) {}
252 
253  Overlay(const std::optional<std::string>& var_str_optional_literal,
254  const std::string& insert_str,
255  const int64_t start,
256  const int64_t replacement_length)
257  : StringOp(SqlStringOpKind::OVERLAY, var_str_optional_literal)
258  , insert_str_(insert_str)
259  , start_(start > 0 ? start - 1 : start)
260  , replacement_length_(
261  static_cast<size_t>(replacement_length >= 0 ? replacement_length : 0)) {}
262 
263  NullableStrType operator()(const std::string& base_str) const override;
264 
265  // Make string_view version?
266  const std::string insert_str_;
267  const int64_t start_;
268  const size_t replacement_length_;
269 };
270 
271 struct Replace : public StringOp {
272  Replace(const std::optional<std::string>& var_str_optional_literal,
273  const std::string& pattern_str,
274  const std::string& replacement_str)
275  : StringOp(SqlStringOpKind::REPLACE, var_str_optional_literal)
276  , pattern_str_(pattern_str)
277  , replacement_str_(replacement_str)
278  , pattern_str_len_(pattern_str.size())
279  , replacement_str_len_(replacement_str.size()) {}
280 
281  NullableStrType operator()(const std::string& str) const override;
282 
283  const std::string pattern_str_;
284  const std::string replacement_str_;
285  const size_t pattern_str_len_;
286  const size_t replacement_str_len_;
287 };
288 
289 struct SplitPart : public StringOp {
290  SplitPart(const std::optional<std::string>& var_str_optional_literal,
291  const std::string& delimiter,
292  const int64_t split_part)
293  : StringOp(SqlStringOpKind::SPLIT_PART, var_str_optional_literal)
294  , delimiter_(delimiter)
295  , split_part_(split_part == 0 ? 1UL : std::abs(split_part))
296  , delimiter_length_(delimiter.size())
297  , reverse_(split_part < 0) {}
298 
299  NullableStrType operator()(const std::string& str) const override;
300 
301  // Make string_view version?
302 
303  const std::string delimiter_;
304  const size_t split_part_;
305  const size_t delimiter_length_;
306  const bool reverse_;
307 };
308 
309 struct RegexpSubstr : public StringOp {
310  public:
311  RegexpSubstr(const std::optional<std::string>& var_str_optional_literal,
312  const std::string& regex_pattern,
313  const int64_t start_pos,
314  const int64_t occurrence,
315  const std::string& regex_params,
316  const int64_t sub_match_group_idx)
317  : StringOp(SqlStringOpKind::REGEXP_SUBSTR, var_str_optional_literal)
318  , regex_pattern_str_(
319  regex_pattern) // for toString() as std::regex does not have str() method
320  , regex_pattern_(
321  StringOp::generateRegex("REGEXP_SUBSTR", regex_pattern, regex_params, true))
322  , start_pos_(start_pos > 0 ? start_pos - 1 : start_pos)
323  , occurrence_(occurrence > 0 ? occurrence - 1 : occurrence)
324  , sub_match_info_(set_sub_match_info(regex_params, sub_match_group_idx)) {}
325 
326  NullableStrType operator()(const std::string& str) const override;
327 
328  private:
329  static std::string get_sub_match(const boost::smatch& match,
330  const std::pair<bool, int64_t> sub_match_info);
331 
332  static std::pair<bool, int64_t> set_sub_match_info(const std::string& regex_pattern,
333  const int64_t sub_match_group_idx);
334 
335  const std::string regex_pattern_str_;
336  const boost::regex regex_pattern_;
337  const int64_t start_pos_;
338  const int64_t occurrence_;
339  const std::pair<bool, int64_t> sub_match_info_;
340 };
341 
342 struct RegexpReplace : public StringOp {
343  public:
344  RegexpReplace(const std::optional<std::string>& var_str_optional_literal,
345  const std::string& regex_pattern,
346  const std::string& replacement,
347  const int64_t start_pos,
348  const int64_t occurrence,
349  const std::string& regex_params)
350  : StringOp(SqlStringOpKind::REGEXP_REPLACE, var_str_optional_literal)
351  , regex_pattern_str_(
352  regex_pattern) // for toString() as std::regex does not have str() method
353  , regex_pattern_(
354  StringOp::generateRegex("REGEXP_REPLACE", regex_pattern, regex_params, false))
355  , replacement_(replacement)
356  , start_pos_(start_pos > 0 ? start_pos - 1 : start_pos)
357  , occurrence_(occurrence) {}
358 
359  NullableStrType operator()(const std::string& str) const override;
360 
361  private:
362  static std::pair<size_t, size_t> get_nth_regex_match(const std::string& str,
363  const size_t start_pos,
364  const boost::regex& regex_pattern,
365  const int64_t occurrence);
366 
367  const std::string regex_pattern_str_;
368  const boost::regex regex_pattern_;
369  const std::string replacement_;
370  const int64_t start_pos_;
371  const int64_t occurrence_;
372 };
373 
374 struct NullOp : public StringOp {
375  NullOp(const std::optional<std::string>& var_str_optional_literal,
376  const SqlStringOpKind op_kind)
377  : StringOp(SqlStringOpKind::INVALID, var_str_optional_literal), op_kind_(op_kind) {}
378 
379  NullableStrType operator()(const std::string& str) const {
380  return NullableStrType(); // null string
381  }
382 
383  const SqlStringOpKind op_kind_;
384 };
385 
386 std::unique_ptr<const StringOp> gen_string_op(const StringOpInfo& string_op_info);
387 
388 std::pair<std::string, bool /* is null */> apply_string_op_to_literals(
389  const StringOpInfo& string_op_info);
390 
391 class StringOps {
392  public:
393  StringOps() : string_ops_(genStringOpsFromOpInfos({})), num_ops_(0UL) {}
394 
395  StringOps(const std::vector<StringOpInfo>& string_op_infos)
396  : string_ops_(genStringOpsFromOpInfos(string_op_infos))
397  , num_ops_(string_op_infos.size()) {}
398 
399  std::string operator()(const std::string& str) const;
400 
401  std::string_view operator()(const std::string_view& sv, std::string& sv_storage) const;
402 
403  size_t size() const { return num_ops_; }
404 
405  private:
406  std::vector<std::unique_ptr<const StringOp>> genStringOpsFromOpInfos(
407  const std::vector<StringOpInfo>& string_op_infos) const;
408 
409  const std::vector<std::unique_ptr<const StringOp>> string_ops_;
410  const size_t num_ops_;
411 };
412 
413 } // namespace StringOps_Namespace
SqlStringOpKind
Definition: sqldefs.h:84
Constants for Builtin SQL Types supported by HEAVY.AI.
CONSTEXPR DEVICE bool is_null(const T &value)
std::pair< std::string, bool > apply_string_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:617
bool g_enable_smem_group_by true
std::string toString(const Executor::ExtModuleKinds &kind)
Definition: Execute.h:1448
bool g_enable_watchdog false
Definition: Execute.cpp:79
#define CHECK(condition)
Definition: Logger.h:222
Common Enum definitions for SQL processing.
constexpr double n
Definition: Utm.h:38
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:467