OmniSciDB  ca0c39ec8f
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOpsIR.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
19 
20 #include "../Shared/funcannotations.h"
21 #include "../Shared/sqldefs.h"
22 #include "Parser/ParserNode.h"
24 #include "StringOps/StringOps.h"
25 
26 #include <boost/locale/conversion.hpp>
27 
28 extern "C" RUNTIME_EXPORT uint64_t string_decode(int8_t* chunk_iter_, int64_t pos) {
29  auto chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
30  VarlenDatum vd;
31  bool is_end;
32  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
33  CHECK(!is_end);
34  return vd.is_null ? 0
35  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
36  (static_cast<uint64_t>(vd.length) << 48);
37 }
38 
39 extern "C" RUNTIME_EXPORT uint64_t string_decompress(const int32_t string_id,
40  const int64_t string_dict_handle) {
41  if (string_id == NULL_INT) {
42  return 0;
43  }
44  auto string_dict_proxy =
45  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
46  auto string_bytes = string_dict_proxy->getStringBytes(string_id);
47  CHECK(string_bytes.first);
48  return (reinterpret_cast<uint64_t>(string_bytes.first) & 0xffffffffffff) |
49  (static_cast<uint64_t>(string_bytes.second) << 48);
50 }
51 
52 extern "C" RUNTIME_EXPORT int32_t string_compress(const int64_t ptr_and_len,
53  const int64_t string_dict_handle) {
54  std::string raw_str(reinterpret_cast<char*>(extract_str_ptr_noinline(ptr_and_len)),
55  extract_str_len_noinline(ptr_and_len));
56  if (raw_str.empty()) {
57  return inline_int_null_value<int32_t>();
58  }
59  auto string_dict_proxy = reinterpret_cast<StringDictionaryProxy*>(string_dict_handle);
60  return string_dict_proxy->getOrAddTransient(raw_str);
61 }
62 
63 extern "C" RUNTIME_EXPORT int32_t
64 apply_string_ops_and_encode(const char* str_ptr,
65  const int32_t str_len,
66  const int64_t string_ops_handle,
67  const int64_t string_dict_handle) {
68  std::string raw_str(str_ptr, str_len);
69  auto string_ops =
70  reinterpret_cast<const StringOps_Namespace::StringOps*>(string_ops_handle);
71  auto string_dict_proxy = reinterpret_cast<StringDictionaryProxy*>(string_dict_handle);
72  const auto result_str = string_ops->operator()(raw_str);
73  if (result_str.empty()) {
74  return inline_int_null_value<int32_t>();
75  }
76  return string_dict_proxy->getOrAddTransient(result_str);
77 }
78 
79 extern "C" RUNTIME_EXPORT int32_t
81  const int64_t source_string_dict_handle,
82  const int64_t dest_string_dict_handle) {
83  const auto source_string_dict_proxy =
84  reinterpret_cast<StringDictionaryProxy*>(source_string_dict_handle);
85  auto dest_string_dict_proxy =
86  reinterpret_cast<StringDictionaryProxy*>(dest_string_dict_handle);
87  // Can we have StringDictionaryProxy::getString return a reference?
88  const auto source_str = source_string_dict_proxy->getString(string_id);
89  if (source_str.empty()) {
90  return inline_int_null_value<int32_t>();
91  }
92  return dest_string_dict_proxy->getIdOfString(source_str);
93 }
94 
95 extern "C" RUNTIME_EXPORT int32_t
97  const int64_t source_string_dict_handle,
98  const int64_t dest_string_dict_handle) {
99  const auto source_string_dict_proxy =
100  reinterpret_cast<StringDictionaryProxy*>(source_string_dict_handle);
101  auto dest_string_dict_proxy =
102  reinterpret_cast<StringDictionaryProxy*>(dest_string_dict_handle);
103  // Can we have StringDictionaryProxy::getString return a reference?
104  const auto source_str = source_string_dict_proxy->getString(string_id);
105  if (source_str.empty()) {
106  return inline_int_null_value<int32_t>();
107  }
108  return dest_string_dict_proxy->getOrAddTransient(source_str);
109 }
110 
111 #define DEF_APPLY_NUMERIC_STRING_OPS(value_type, value_name) \
112  extern "C" RUNTIME_EXPORT ALWAYS_INLINE value_type \
113  apply_numeric_string_ops_##value_name( \
114  const char* str_ptr, const int32_t str_len, const int64_t string_ops_handle) { \
115  std::string raw_str(str_ptr, str_len); \
116  auto string_ops = \
117  reinterpret_cast<const StringOps_Namespace::StringOps*>(string_ops_handle); \
118  const auto result_datum = string_ops->numericEval(raw_str); \
119  return result_datum.value_name##val; \
120  }
121 
122 DEF_APPLY_NUMERIC_STRING_OPS(int8_t, bool)
123 DEF_APPLY_NUMERIC_STRING_OPS(int8_t, tinyint)
124 DEF_APPLY_NUMERIC_STRING_OPS(int16_t, smallint)
125 DEF_APPLY_NUMERIC_STRING_OPS(int32_t, int)
126 DEF_APPLY_NUMERIC_STRING_OPS(int64_t, bigint)
127 DEF_APPLY_NUMERIC_STRING_OPS(float, float)
128 DEF_APPLY_NUMERIC_STRING_OPS(double, double)
129 
130 #undef DEF_APPLY_NUMERIC_STRING_OPS
131 
133  const CompilationOptions& co) {
135  auto str_lv = codegen(expr->get_arg(), true, co);
136  if (str_lv.size() != 3) {
137  CHECK_EQ(size_t(1), str_lv.size());
138  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
139  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
140  if (co.device_type == ExecutorDeviceType::GPU) {
141  throw QueryMustRunOnCpu();
142  }
143  }
144  std::vector<llvm::Value*> charlength_args{str_lv[1], str_lv[2]};
145  std::string fn_name("char_length");
146  if (expr->get_calc_encoded_length()) {
147  fn_name += "_encoded";
148  }
149  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
150  if (is_nullable) {
151  fn_name += "_nullable";
152  charlength_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
153  }
154  return expr->get_calc_encoded_length()
155  ? cgen_state_->emitExternalCall(
156  fn_name, get_int_type(32, cgen_state_->context_), charlength_args)
157  : cgen_state_->emitCall(fn_name, charlength_args);
158 }
159 
161  const CompilationOptions& co) {
163  auto str_lv = codegen(expr->get_arg(), true, co);
164  CHECK_EQ(size_t(1), str_lv.size());
165  return cgen_state_->emitCall("key_for_string_encoded", str_lv);
166 }
167 
168 std::vector<StringOps_Namespace::StringOpInfo> getStringOpInfos(
169  const Analyzer::StringOper* expr) {
170  std::vector<StringOps_Namespace::StringOpInfo> string_op_infos;
171  auto chained_string_op_exprs = expr->getChainedStringOpExprs();
172  if (chained_string_op_exprs.empty()) {
173  // Likely will change the below to a CHECK but until we have more confidence
174  // that all potential query patterns have nodes that might contain string ops folded,
175  // leaving as an error for now
176  throw std::runtime_error(
177  "Expected folded string operator but found operator unfolded.");
178  }
179  // Consider encapsulating below in an Analyzer::StringOper method to dedup
180  for (const auto& chained_string_op_expr : chained_string_op_exprs) {
181  auto chained_string_op =
182  dynamic_cast<const Analyzer::StringOper*>(chained_string_op_expr.get());
183  CHECK(chained_string_op);
184  StringOps_Namespace::StringOpInfo string_op_info(chained_string_op->get_kind(),
185  chained_string_op->get_type_info(),
186  chained_string_op->getLiteralArgs());
187  string_op_infos.emplace_back(string_op_info);
188  }
189  return string_op_infos;
190 }
191 
193  const CompilationOptions& co) {
195  CHECK_GE(expr->getArity(), 1UL);
196 
197  const auto& expr_ti = expr->get_type_info();
198  // Should probably CHECK we have a UOper cast to dict encoded to be consistent
199  const auto primary_arg = remove_cast(expr->getArg(0));
200  CHECK(primary_arg->get_type_info().is_none_encoded_string());
201 
202  if (g_cluster) {
203  throw std::runtime_error(
204  "Cast from none-encoded string to dictionary-encoded not supported for "
205  "distributed queries");
206  }
208  throw QueryMustRunOnCpu();
209  }
210  auto primary_str_lv = codegen(primary_arg, true, co);
211  CHECK_EQ(size_t(3), primary_str_lv.size());
212  const auto string_op_infos = getStringOpInfos(expr);
213  CHECK(string_op_infos.size());
214 
215  const auto string_ops =
216  executor()->getRowSetMemoryOwner()->getStringOps(string_op_infos);
217  const int64_t string_ops_handle = reinterpret_cast<int64_t>(string_ops);
218  auto string_ops_handle_lv = cgen_state_->llInt(string_ops_handle);
219 
220  const auto& return_ti = expr->get_type_info();
221  if (!return_ti.is_string()) {
222  std::vector<llvm::Value*> string_oper_lvs{
223  primary_str_lv[1], primary_str_lv[2], string_ops_handle_lv};
224  const auto return_type = return_ti.get_type();
225  std::string fn_call = "apply_numeric_string_ops_";
226  switch (return_type) {
227  case kBOOLEAN: {
228  fn_call += "bool";
229  break;
230  }
231  case kTINYINT:
232  case kSMALLINT:
233  case kINT:
234  case kBIGINT:
235  case kFLOAT:
236  case kDOUBLE: {
237  fn_call += to_lower(toString(return_type));
238  break;
239  }
240  case kNUMERIC:
241  case kDECIMAL:
242  case kTIME:
243  case kTIMESTAMP:
244  case kDATE: {
245  fn_call += "bigint";
246  break;
247  }
248  default: {
249  throw std::runtime_error("Unimplemented type for string-to-numeric translation");
250  }
251  }
252  const auto logical_size = return_ti.get_logical_size() * 8;
253  auto llvm_return_type = return_ti.is_fp()
254  ? get_fp_type(logical_size, cgen_state_->context_)
255  : get_int_type(logical_size, cgen_state_->context_);
256  return cgen_state_->emitExternalCall(fn_call, llvm_return_type, string_oper_lvs);
257  }
258 
259  // If here we are outputting a string dictionary column
260 
261  const int64_t dest_string_proxy_handle =
262  reinterpret_cast<int64_t>(executor()->getStringDictionaryProxy(
263  expr_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true));
264  auto dest_string_proxy_handle_lv = cgen_state_->llInt(dest_string_proxy_handle);
265  std::vector<llvm::Value*> string_oper_lvs{primary_str_lv[1],
266  primary_str_lv[2],
267  string_ops_handle_lv,
268  dest_string_proxy_handle_lv};
269 
270  return cgen_state_->emitExternalCall("apply_string_ops_and_encode",
272  string_oper_lvs);
273 }
274 
275 std::unique_ptr<StringDictionaryTranslationMgr> translate_dict_strings(
276  const Analyzer::StringOper* expr,
277  const ExecutorDeviceType device_type,
278  Executor* executor) {
279  const auto& expr_ti = expr->get_type_info();
280  const auto& primary_input_expr_ti = expr->getArg(0)->get_type_info();
281  const auto dict_id = primary_input_expr_ti.get_comp_param();
282  const auto string_op_infos = getStringOpInfos(expr);
283  CHECK(string_op_infos.size());
284 
285  if (string_op_infos.back().getReturnType().is_dict_encoded_string()) {
286  // string->string translation
287  auto string_dictionary_translation_mgr =
288  std::make_unique<StringDictionaryTranslationMgr>(
289  dict_id,
290  dict_id,
291  false, // translate_intersection_only
292  expr_ti,
293  string_op_infos,
296  executor->deviceCount(device_type),
297  executor,
298  &executor->getCatalog()->getDataMgr(),
299  false /* delay_translation */);
300  return string_dictionary_translation_mgr;
301  } else {
302  // string->numeric translation
303  auto string_dictionary_translation_mgr =
304  std::make_unique<StringDictionaryTranslationMgr>(
305  dict_id,
306  expr_ti,
307  string_op_infos,
310  executor->deviceCount(device_type),
311  executor,
312  &executor->getCatalog()->getDataMgr(),
313  false /* delay_translation */);
314  return string_dictionary_translation_mgr;
315  }
316 }
317 
319  const CompilationOptions& co) {
320  CHECK_GE(expr->getArity(), 1UL);
321  if (expr->hasNoneEncodedTextArg()) {
322  return codegenPerRowStringOper(expr, co);
323  }
325 
326  const auto& expr_ti = expr->get_type_info();
327  auto string_dictionary_translation_mgr =
329 
330  auto str_id_lv = codegen(expr->getArg(0), true, co);
331  CHECK_EQ(size_t(1), str_id_lv.size());
332 
333  return cgen_state_
334  ->moveStringDictionaryTranslationMgr(std::move(string_dictionary_translation_mgr))
335  ->codegen(str_id_lv[0], expr_ti, true /* add_nullcheck */, co);
336 }
337 
338 // Method below is for join probes, as we cast the StringOper nodes to ColumnVars early to
339 // not special case that codepath (but retain the StringOpInfos, which we use here to
340 // execute the same string ops as we would on a native StringOper node)
342  const Analyzer::ColumnVar* expr,
343  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos,
344  const CompilationOptions& co) {
346  const auto& expr_ti = expr->get_type_info();
347  const auto dict_id = expr_ti.get_comp_param();
348 
349  auto string_dictionary_translation_mgr =
350  std::make_unique<StringDictionaryTranslationMgr>(
351  dict_id,
352  dict_id,
353  false, // translate_intersection_only
354  expr->get_type_info(),
355  string_op_infos,
358  executor()->deviceCount(co.device_type),
359  executor(),
360  &executor()->getCatalog()->getDataMgr(),
361  false /* delay_translation */);
362 
363  auto str_id_lv = codegen(expr, true /* fetch_column */, co);
364  CHECK_EQ(size_t(1), str_id_lv.size());
365 
366  return cgen_state_
367  ->moveStringDictionaryTranslationMgr(std::move(string_dictionary_translation_mgr))
368  ->codegen(str_id_lv[0], expr_ti, true /* add_nullcheck */, co);
369 }
370 
372  const CompilationOptions& co) {
374  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
375  throw std::runtime_error("LIKE not supported for unnested expressions");
376  }
377  char escape_char{'\\'};
378  if (expr->get_escape_expr()) {
379  auto escape_char_expr =
380  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
381  CHECK(escape_char_expr);
382  CHECK(escape_char_expr->get_type_info().is_string());
383  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
384  escape_char = (*escape_char_expr->get_constval().stringval)[0];
385  }
386  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_like_expr());
387  CHECK(pattern);
388  auto fast_dict_like_lv = codegenDictLike(expr->get_own_arg(),
389  pattern,
390  expr->get_is_ilike(),
391  expr->get_is_simple(),
392  escape_char,
393  co);
394  if (fast_dict_like_lv) {
395  return fast_dict_like_lv;
396  }
397  const auto& ti = expr->get_arg()->get_type_info();
398  CHECK(ti.is_string());
399  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
400  throw WatchdogException(
401  "Cannot do LIKE / ILIKE on this dictionary encoded column, its cardinality is "
402  "too high");
403  }
404  auto str_lv = codegen(expr->get_arg(), true, co);
405  if (str_lv.size() != 3) {
406  CHECK_EQ(size_t(1), str_lv.size());
407  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
408  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
409  if (co.device_type == ExecutorDeviceType::GPU) {
410  throw QueryMustRunOnCpu();
411  }
412  }
413  auto like_expr_arg_lvs = codegen(expr->get_like_expr(), true, co);
414  CHECK_EQ(size_t(3), like_expr_arg_lvs.size());
415  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
416  std::vector<llvm::Value*> str_like_args{
417  str_lv[1], str_lv[2], like_expr_arg_lvs[1], like_expr_arg_lvs[2]};
418  std::string fn_name{expr->get_is_ilike() ? "string_ilike" : "string_like"};
419  if (expr->get_is_simple()) {
420  fn_name += "_simple";
421  } else {
422  str_like_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
423  }
424  if (is_nullable) {
425  fn_name += "_nullable";
426  str_like_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
427  }
428  return cgen_state_->emitCall(fn_name, str_like_args);
429 }
430 
432  Executor* executor) {
433  // If here we are operating on top of one or more string functions, i.e. LOWER(str),
434  // and before running the dictionary LIKE/ILIKE or REGEXP_LIKE,
435  // we need to translate the strings first.
436 
437  // This approach is a temporary solution until we can implement the next stage
438  // of the string translation project, which will broaden the StringOper class to include
439  // operations that operate on strings but do not necessarily return strings like
440  // LIKE/ILIKE/REGEXP_LIKE/CHAR_LENGTH At this point these aforementioned operators,
441  // including LIKE/ILIKE, will just become part of a StringOps chain (which will also
442  // avoid the overhead of serializing the transformed raw strings from previous string
443  // opers to the dictionary to only read back out and perform LIKE/ILIKE.)
444  CHECK_GT(string_oper->getArity(), 0UL);
445  const auto& string_oper_primary_arg_ti = string_oper->getArg(0)->get_type_info();
446  CHECK(string_oper_primary_arg_ti.is_dict_encoded_string());
447  CHECK_NE(string_oper_primary_arg_ti.get_comp_param(), TRANSIENT_DICT_ID);
448  // Note the actual translation below will be cached by RowSetMemOwner
449  translate_dict_strings(string_oper, ExecutorDeviceType::CPU, executor);
450 }
451 
453  const std::shared_ptr<Analyzer::Expr> like_arg,
454  const Analyzer::Constant* pattern,
455  const bool ilike,
456  const bool is_simple,
457  const char escape_char,
458  const CompilationOptions& co) {
460  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(like_arg);
461  if (!cast_oper) {
462  return nullptr;
463  }
464  CHECK(cast_oper);
465  CHECK_EQ(kCAST, cast_oper->get_optype());
466  const auto dict_like_arg = cast_oper->get_own_operand();
467  const auto& dict_like_arg_ti = dict_like_arg->get_type_info();
468  if (!dict_like_arg_ti.is_string()) {
469  throw(std::runtime_error("Cast from " + dict_like_arg_ti.get_type_name() + " to " +
470  cast_oper->get_type_info().get_type_name() +
471  " not supported"));
472  }
473  CHECK_EQ(kENCODING_DICT, dict_like_arg_ti.get_compression());
474  const auto sdp = executor()->getStringDictionaryProxy(
475  dict_like_arg_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
476  if (sdp->storageEntryCount() > 200000000) {
477  return nullptr;
478  }
479  if (sdp->getDictId() == TRANSIENT_DICT_ID) {
480  // If we have a literal dictionary it was a product
481  // of string ops applied to none-encoded strings, and
482  // will not be populated at codegen-time, so we
483  // cannot use the fast path
484 
485  // Todo(todd): Once string ops support non-string producing
486  // operators (like like/ilike), like/ilike can be chained and
487  // we can avoid the string translation
488  return nullptr;
489  }
490  const auto string_oper = dynamic_cast<const Analyzer::StringOper*>(dict_like_arg.get());
491  if (string_oper) {
492  pre_translate_string_ops(string_oper, executor());
493  }
494  const auto& pattern_ti = pattern->get_type_info();
495  CHECK(pattern_ti.is_string());
496  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
497  const auto& pattern_datum = pattern->get_constval();
498  const auto& pattern_str = *pattern_datum.stringval;
499  const auto matching_ids = sdp->getLike(pattern_str, ilike, is_simple, escape_char);
500  // InIntegerSet requires 64-bit values
501  std::vector<int64_t> matching_ids_64(matching_ids.size());
502  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
503  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
504  dict_like_arg, matching_ids_64, dict_like_arg_ti.get_notnull());
505  return codegen(in_values.get(), co);
506 }
507 
508 namespace {
509 
510 std::vector<int32_t> get_compared_ids(const StringDictionaryProxy* dict,
511  const SQLOps compare_operator,
512  const std::string& pattern) {
513  std::vector<int> ret;
514  switch (compare_operator) {
515  case kLT:
516  ret = dict->getCompare(pattern, "<");
517  break;
518  case kLE:
519  ret = dict->getCompare(pattern, "<=");
520  break;
521  case kEQ:
522  case kBW_EQ:
523  ret = dict->getCompare(pattern, "=");
524  break;
525  case kGT:
526  ret = dict->getCompare(pattern, ">");
527  break;
528  case kGE:
529  ret = dict->getCompare(pattern, ">=");
530  break;
531  case kNE:
532  ret = dict->getCompare(pattern, "<>");
533  break;
534  default:
535  std::runtime_error("unsuported operator for string comparision");
536  }
537  return ret;
538 }
539 } // namespace
540 
541 llvm::Value* CodeGenerator::codegenDictStrCmp(const std::shared_ptr<Analyzer::Expr> lhs,
542  const std::shared_ptr<Analyzer::Expr> rhs,
543  const SQLOps compare_operator,
544  const CompilationOptions& co) {
546  auto rhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(rhs);
547  auto lhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(lhs);
548  auto rhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(rhs);
549  auto lhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(lhs);
550  std::shared_ptr<const Analyzer::UOper> cast_oper;
551  std::shared_ptr<const Analyzer::ColumnVar> col_var;
552  auto compare_opr = compare_operator;
553  if (lhs_col_var && rhs_col_var) {
554  if (lhs_col_var->get_type_info().get_comp_param() ==
555  rhs_col_var->get_type_info().get_comp_param()) {
556  if (compare_operator == kEQ || compare_operator == kNE) {
557  // TODO (vraj): implement compare between two dictionary encoded columns which
558  // share a dictionary
559  return nullptr;
560  }
561  }
562  // TODO (vraj): implement compare between two dictionary encoded columns which don't
563  // shared dictionary
564  throw std::runtime_error("Decoding two Dictionary encoded columns will be slow");
565  } else if (lhs_col_var && rhs_cast_oper) {
566  cast_oper.swap(rhs_cast_oper);
567  col_var.swap(lhs_col_var);
568  } else if (lhs_cast_oper && rhs_col_var) {
569  cast_oper.swap(lhs_cast_oper);
570  col_var.swap(rhs_col_var);
571  switch (compare_operator) {
572  case kLT:
573  compare_opr = kGT;
574  break;
575  case kLE:
576  compare_opr = kGE;
577  break;
578  case kGT:
579  compare_opr = kLT;
580  break;
581  case kGE:
582  compare_opr = kLE;
583  default:
584  break;
585  }
586  }
587  if (!cast_oper || !col_var) {
588  return nullptr;
589  }
590  CHECK_EQ(kCAST, cast_oper->get_optype());
591 
592  const auto const_expr =
593  dynamic_cast<Analyzer::Constant*>(cast_oper->get_own_operand().get());
594  if (!const_expr) {
595  // Analyzer casts dictionary encoded columns to none encoded if there is a comparison
596  // between two encoded columns. Which we currently do not handle.
597  return nullptr;
598  }
599  const auto& const_val = const_expr->get_constval();
600 
601  const auto col_ti = col_var->get_type_info();
602  CHECK(col_ti.is_string());
603  CHECK_EQ(kENCODING_DICT, col_ti.get_compression());
604  const auto sdp = executor()->getStringDictionaryProxy(
605  col_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
606 
607  if (sdp->storageEntryCount() > 200000000) {
608  std::runtime_error("Cardinality for string dictionary is too high");
609  return nullptr;
610  }
611 
612  const auto& pattern_str = *const_val.stringval;
613  const auto matching_ids = get_compared_ids(sdp, compare_opr, pattern_str);
614 
615  // InIntegerSet requires 64-bit values
616  std::vector<int64_t> matching_ids_64(matching_ids.size());
617  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
618 
619  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
620  col_var, matching_ids_64, col_ti.get_notnull());
621  return codegen(in_values.get(), co);
622 }
623 
625  const CompilationOptions& co) {
627  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
628  throw std::runtime_error("REGEXP not supported for unnested expressions");
629  }
630  char escape_char{'\\'};
631  if (expr->get_escape_expr()) {
632  auto escape_char_expr =
633  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
634  CHECK(escape_char_expr);
635  CHECK(escape_char_expr->get_type_info().is_string());
636  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
637  escape_char = (*escape_char_expr->get_constval().stringval)[0];
638  }
639  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_pattern_expr());
640  CHECK(pattern);
641  auto fast_dict_pattern_lv =
642  codegenDictRegexp(expr->get_own_arg(), pattern, escape_char, co);
643  if (fast_dict_pattern_lv) {
644  return fast_dict_pattern_lv;
645  }
646  const auto& ti = expr->get_arg()->get_type_info();
647  CHECK(ti.is_string());
648  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
649  throw WatchdogException(
650  "Cannot do REGEXP_LIKE on this dictionary encoded column, its cardinality is too "
651  "high");
652  }
653  // Now we know we are working on NONE ENCODED column. So switch back to CPU
655  throw QueryMustRunOnCpu();
656  }
657  auto str_lv = codegen(expr->get_arg(), true, co);
658  if (str_lv.size() != 3) {
659  CHECK_EQ(size_t(1), str_lv.size());
660  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
661  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
662  }
663  auto regexp_expr_arg_lvs = codegen(expr->get_pattern_expr(), true, co);
664  CHECK_EQ(size_t(3), regexp_expr_arg_lvs.size());
665  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
666  std::vector<llvm::Value*> regexp_args{
667  str_lv[1], str_lv[2], regexp_expr_arg_lvs[1], regexp_expr_arg_lvs[2]};
668  std::string fn_name("regexp_like");
669  regexp_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
670  if (is_nullable) {
671  fn_name += "_nullable";
672  regexp_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
673  return cgen_state_->emitExternalCall(
674  fn_name, get_int_type(8, cgen_state_->context_), regexp_args);
675  }
676  return cgen_state_->emitExternalCall(
677  fn_name, get_int_type(1, cgen_state_->context_), regexp_args);
678 }
679 
681  const std::shared_ptr<Analyzer::Expr> pattern_arg,
682  const Analyzer::Constant* pattern,
683  const char escape_char,
684  const CompilationOptions& co) {
686  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(pattern_arg);
687  if (!cast_oper) {
688  return nullptr;
689  }
690  CHECK(cast_oper);
691  CHECK_EQ(kCAST, cast_oper->get_optype());
692  const auto dict_regexp_arg = cast_oper->get_own_operand();
693  const auto& dict_regexp_arg_ti = dict_regexp_arg->get_type_info();
694  CHECK(dict_regexp_arg_ti.is_string());
695  CHECK_EQ(kENCODING_DICT, dict_regexp_arg_ti.get_compression());
696  const auto comp_param = dict_regexp_arg_ti.get_comp_param();
697  const auto sdp = executor()->getStringDictionaryProxy(
698  comp_param, executor()->getRowSetMemoryOwner(), true);
699  if (sdp->storageEntryCount() > 15000000) {
700  return nullptr;
701  }
702  if (sdp->getDictId() == TRANSIENT_DICT_ID) {
703  // If we have a literal dictionary it was a product
704  // of string ops applied to none-encoded strings, and
705  // will not be populated at codegen-time, so we
706  // cannot use the fast path
707 
708  // Todo(todd): Once string ops support non-string producing
709  // operators (like regexp_like), these operators can be chained
710  // and we can avoid the string translation
711  return nullptr;
712  }
713  const auto string_oper =
714  dynamic_cast<const Analyzer::StringOper*>(dict_regexp_arg.get());
715  if (string_oper) {
716  pre_translate_string_ops(string_oper, executor());
717  }
718  const auto& pattern_ti = pattern->get_type_info();
719  CHECK(pattern_ti.is_string());
720  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
721  const auto& pattern_datum = pattern->get_constval();
722  const auto& pattern_str = *pattern_datum.stringval;
723  const auto matching_ids = sdp->getRegexpLike(pattern_str, escape_char);
724  // InIntegerSet requires 64-bit values
725  std::vector<int64_t> matching_ids_64(matching_ids.size());
726  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
727  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
728  dict_regexp_arg, matching_ids_64, dict_regexp_arg_ti.get_notnull());
729  return codegen(in_values.get(), co);
730 }
std::string to_lower(const std::string &str)
#define CHECK_EQ(x, y)
Definition: Logger.h:230
std::pair< const char *, size_t > getStringBytes(int32_t string_id) const noexcept
llvm::Value * codegenPerRowStringOper(const Analyzer::StringOper *string_oper, const CompilationOptions &co)
bool hasNoneEncodedTextArg() const
Definition: Analyzer.h:1558
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:977
RUNTIME_EXPORT int32_t union_translate_string_id_to_other_dict(const int32_t string_id, const int64_t source_string_dict_handle, const int64_t dest_string_dict_handle)
Definition: StringOpsIR.cpp:96
std::vector< int32_t > get_compared_ids(const StringDictionaryProxy *dict, const SQLOps compare_operator, const std::string &pattern)
const Expr * get_escape_expr() const
Definition: Analyzer.h:907
#define DEF_APPLY_NUMERIC_STRING_OPS(value_type, value_name)
Definition: sqltypes.h:64
std::shared_ptr< Analyzer::Expr > remove_cast(const std::shared_ptr< Analyzer::Expr > &expr)
Definition: Analyzer.cpp:4255
ExecutorDeviceType
CgenState * cgen_state_
bool is_null
Definition: Datum.h:35
std::unique_ptr< StringDictionaryTranslationMgr > translate_dict_strings(const Analyzer::StringOper *expr, const ExecutorDeviceType device_type, Executor *executor)
void pre_translate_string_ops(const Analyzer::StringOper *string_oper, Executor *executor)
const Expr * get_escape_expr() const
Definition: Analyzer.h:979
llvm::Value * emitExternalCall(const std::string &fname, llvm::Type *ret_type, const std::vector< llvm::Value * > args, const std::vector< llvm::Attribute::AttrKind > &fnattrs={}, const bool has_struct_return=false)
Definition: CgenState.h:217
SQLOps
Definition: sqldefs.h:28
Definition: sqldefs.h:34
Definition: sqldefs.h:35
#define CHECK_GE(x, y)
Definition: Logger.h:235
Definition: sqldefs.h:48
llvm::Value * codegenPseudoStringOper(const Analyzer::ColumnVar *, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos, const CompilationOptions &)
Definition: sqldefs.h:29
llvm::Type * get_fp_type(const int width, llvm::LLVMContext &context)
const Expr * get_arg() const
Definition: Analyzer.h:976
size_t getArity() const
Definition: Analyzer.h:1523
const Analyzer::Expr * extract_cast_arg(const Analyzer::Expr *expr)
Definition: Execute.h:201
std::string getString(int32_t string_id) const
std::string toString(const QueryDescriptionType &type)
Definition: Types.h:64
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:379
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define CHECK_GT(x, y)
Definition: Logger.h:234
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:182
const Expr * get_arg() const
Definition: Analyzer.h:904
int8_t * pointer
Definition: Datum.h:34
#define NULL_INT
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
RUNTIME_EXPORT NEVER_INLINE int32_t extract_str_len_noinline(const uint64_t str_and_len)
llvm::LLVMContext & context_
Definition: CgenState.h:439
Classes representing a parse tree.
DEVICE auto copy(ARGS &&...args)
Definition: gpu_enabled.h:51
#define CHECK_NE(x, y)
Definition: Logger.h:231
bool g_enable_watchdog
bool get_is_simple() const
Definition: Analyzer.h:909
llvm::Value * codegenDictStrCmp(const std::shared_ptr< Analyzer::Expr >, const std::shared_ptr< Analyzer::Expr >, const SQLOps, const CompilationOptions &co)
llvm::Value * codegenDictRegexp(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const char escape_char, const CompilationOptions &)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:82
llvm::Value * emitCall(const std::string &fname, const std::vector< llvm::Value * > &args)
Definition: CgenState.cpp:219
std::string * stringval
Definition: Datum.h:54
ExecutorDeviceType device_type
#define RUNTIME_EXPORT
std::vector< StringOps_Namespace::StringOpInfo > getStringOpInfos(const Analyzer::StringOper *expr)
Definition: sqldefs.h:33
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:30
const Expr * get_pattern_expr() const
Definition: Analyzer.h:978
RUNTIME_EXPORT NEVER_INLINE int8_t * extract_str_ptr_noinline(const uint64_t str_and_len)
Definition: sqltypes.h:68
Expression class for string functions The "arg" constructor parameter must be an expression that reso...
Definition: Analyzer.h:1464
#define TRANSIENT_DICT_ID
Definition: sqltypes.h:309
RUNTIME_EXPORT int32_t intersect_translate_string_id_to_other_dict(const int32_t string_id, const int64_t source_string_dict_handle, const int64_t dest_string_dict_handle)
Definition: StringOpsIR.cpp:80
RUNTIME_EXPORT int32_t apply_string_ops_and_encode(const char *str_ptr, const int32_t str_len, const int64_t string_ops_handle, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:64
const Expr * get_like_expr() const
Definition: Analyzer.h:906
Datum get_constval() const
Definition: Analyzer.h:343
Definition: sqldefs.h:31
const Expr * get_arg() const
Definition: Analyzer.h:711
RUNTIME_EXPORT int32_t string_compress(const int64_t ptr_and_len, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:52
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:388
const Expr * get_arg() const
Definition: Analyzer.h:760
llvm::ConstantInt * llInt(const T v) const
Definition: CgenState.h:306
#define CHECK(condition)
Definition: Logger.h:222
Definition: sqldefs.h:30
int32_t getOrAddTransient(const std::string &str)
bool g_cluster
Definition: sqldefs.h:32
const StringDictionaryTranslationMgr * moveStringDictionaryTranslationMgr(std::unique_ptr< const StringDictionaryTranslationMgr > &&str_dict_translation_mgr)
Definition: CgenState.h:196
Definition: sqltypes.h:60
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:905
bool is_unnest(const Analyzer::Expr *expr)
Definition: Execute.h:1464
__device__ uint64_t string_decode(int8_t *chunk_iter_, int64_t pos)
llvm::Value * codegen(llvm::Value *str_id_input, const SQLTypeInfo &input_ti, const bool add_nullcheck, const CompilationOptions &co) const
std::vector< std::shared_ptr< Analyzer::Expr > > getChainedStringOpExprs() const
Definition: Analyzer.h:1547
const Expr * getArg(const size_t i) const
Definition: Analyzer.h:1535
RUNTIME_EXPORT uint64_t string_decompress(const int32_t string_id, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:39
bool get_is_ilike() const
Definition: Analyzer.h:908
llvm::Value * codegenDictLike(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const bool ilike, const bool is_simple, const char escape_char, const CompilationOptions &)
size_t length
Definition: Datum.h:33
Executor * executor() const