OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOpsIR.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
19 
20 #include "../Shared/funcannotations.h"
21 #include "../Shared/sqldefs.h"
22 #include "Parser/ParserNode.h"
24 #include "StringOps/StringOps.h"
25 
26 #include <boost/locale/conversion.hpp>
27 
28 extern "C" RUNTIME_EXPORT uint64_t string_decode(int8_t* chunk_iter_, int64_t pos) {
29  auto chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
30  VarlenDatum vd;
31  bool is_end;
32  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
33  CHECK(!is_end);
34  return vd.is_null ? 0
35  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
36  (static_cast<uint64_t>(vd.length) << 48);
37 }
38 
39 extern "C" RUNTIME_EXPORT uint64_t string_decompress(const int32_t string_id,
40  const int64_t string_dict_handle) {
41  if (string_id == NULL_INT) {
42  return 0;
43  }
44  auto string_dict_proxy =
45  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
46  auto string_bytes = string_dict_proxy->getStringBytes(string_id);
47  CHECK(string_bytes.first);
48  return (reinterpret_cast<uint64_t>(string_bytes.first) & 0xffffffffffff) |
49  (static_cast<uint64_t>(string_bytes.second) << 48);
50 }
51 
52 extern "C" RUNTIME_EXPORT int32_t string_compress(const int64_t ptr_and_len,
53  const int64_t string_dict_handle) {
54  std::string raw_str(reinterpret_cast<char*>(extract_str_ptr_noinline(ptr_and_len)),
55  extract_str_len_noinline(ptr_and_len));
56  if (raw_str.empty()) {
57  return inline_int_null_value<int32_t>();
58  }
59  auto string_dict_proxy = reinterpret_cast<StringDictionaryProxy*>(string_dict_handle);
60  return string_dict_proxy->getOrAddTransient(raw_str);
61 }
62 
63 extern "C" RUNTIME_EXPORT int32_t
64 apply_string_ops_and_encode(const char* str_ptr,
65  const int32_t str_len,
66  const int64_t string_ops_handle,
67  const int64_t string_dict_handle) {
68  std::string raw_str(str_ptr, str_len);
69  auto string_ops =
70  reinterpret_cast<const StringOps_Namespace::StringOps*>(string_ops_handle);
71  auto string_dict_proxy = reinterpret_cast<StringDictionaryProxy*>(string_dict_handle);
72  const auto result_str = string_ops->operator()(raw_str);
73  if (result_str.empty()) {
74  return inline_int_null_value<int32_t>();
75  }
76  return string_dict_proxy->getOrAddTransient(result_str);
77 }
78 
79 extern "C" RUNTIME_EXPORT int32_t
81  const int64_t source_string_dict_handle,
82  const int64_t dest_string_dict_handle) {
83  const auto source_string_dict_proxy =
84  reinterpret_cast<StringDictionaryProxy*>(source_string_dict_handle);
85  auto dest_string_dict_proxy =
86  reinterpret_cast<StringDictionaryProxy*>(dest_string_dict_handle);
87  // Can we have StringDictionaryProxy::getString return a reference?
88  const auto source_str = source_string_dict_proxy->getString(string_id);
89  if (source_str.empty()) {
90  return inline_int_null_value<int32_t>();
91  }
92  return dest_string_dict_proxy->getIdOfString(source_str);
93 }
94 
95 extern "C" RUNTIME_EXPORT int32_t
97  const int64_t source_string_dict_handle,
98  const int64_t dest_string_dict_handle) {
99  const auto source_string_dict_proxy =
100  reinterpret_cast<StringDictionaryProxy*>(source_string_dict_handle);
101  auto dest_string_dict_proxy =
102  reinterpret_cast<StringDictionaryProxy*>(dest_string_dict_handle);
103  // Can we have StringDictionaryProxy::getString return a reference?
104  const auto source_str = source_string_dict_proxy->getString(string_id);
105  if (source_str.empty()) {
106  return inline_int_null_value<int32_t>();
107  }
108  return dest_string_dict_proxy->getOrAddTransient(source_str);
109 }
110 
112  const CompilationOptions& co) {
114  auto str_lv = codegen(expr->get_arg(), true, co);
115  if (str_lv.size() != 3) {
116  CHECK_EQ(size_t(1), str_lv.size());
117  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
118  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
119  if (co.device_type == ExecutorDeviceType::GPU) {
120  throw QueryMustRunOnCpu();
121  }
122  }
123  std::vector<llvm::Value*> charlength_args{str_lv[1], str_lv[2]};
124  std::string fn_name("char_length");
125  if (expr->get_calc_encoded_length()) {
126  fn_name += "_encoded";
127  }
128  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
129  if (is_nullable) {
130  fn_name += "_nullable";
131  charlength_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
132  }
133  return expr->get_calc_encoded_length()
134  ? cgen_state_->emitExternalCall(
135  fn_name, get_int_type(32, cgen_state_->context_), charlength_args)
136  : cgen_state_->emitCall(fn_name, charlength_args);
137 }
138 
140  const CompilationOptions& co) {
142  auto str_lv = codegen(expr->get_arg(), true, co);
143  CHECK_EQ(size_t(1), str_lv.size());
144  return cgen_state_->emitCall("key_for_string_encoded", str_lv);
145 }
146 
147 std::vector<StringOps_Namespace::StringOpInfo> getStringOpInfos(
148  const Analyzer::StringOper* expr) {
149  std::vector<StringOps_Namespace::StringOpInfo> string_op_infos;
150  auto chained_string_op_exprs = expr->getChainedStringOpExprs();
151  if (chained_string_op_exprs.empty()) {
152  // Likely will change the below to a CHECK but until we have more confidence
153  // that all potential query patterns have nodes that might contain string ops folded,
154  // leaving as an error for now
155  throw std::runtime_error(
156  "Expected folded string operator but found operator unfolded.");
157  }
158  // Consider encapsulating below in an Analyzer::StringOper method to dedup
159  for (const auto& chained_string_op_expr : chained_string_op_exprs) {
160  auto chained_string_op =
161  dynamic_cast<const Analyzer::StringOper*>(chained_string_op_expr.get());
162  CHECK(chained_string_op);
163  StringOps_Namespace::StringOpInfo string_op_info(chained_string_op->get_kind(),
164  chained_string_op->getLiteralArgs());
165  string_op_infos.emplace_back(string_op_info);
166  }
167  return string_op_infos;
168 }
169 
171  const CompilationOptions& co) {
173  CHECK_GE(expr->getArity(), 1UL);
174 
175  const auto& expr_ti = expr->get_type_info();
176  // Should probably CHECK we have a UOper cast to dict encoded to be consistent
177  const auto primary_arg = remove_cast(expr->getArg(0));
178  CHECK(primary_arg->get_type_info().is_none_encoded_string());
179 
180  if (g_cluster) {
181  throw std::runtime_error(
182  "Cast from none-encoded string to dictionary-encoded not supported for "
183  "distributed queries");
184  }
186  throw QueryMustRunOnCpu();
187  }
188  auto primary_str_lv = codegen(primary_arg, true, co);
189  CHECK_EQ(size_t(3), primary_str_lv.size());
190  const auto string_op_infos = getStringOpInfos(expr);
191  CHECK(string_op_infos.size());
192 
193  const auto string_ops =
194  executor()->getRowSetMemoryOwner()->getStringOps(string_op_infos);
195  const int64_t string_ops_handle = reinterpret_cast<int64_t>(string_ops);
196  auto string_ops_handle_lv = cgen_state_->llInt(string_ops_handle);
197 
198  const int64_t dest_string_proxy_handle =
199  reinterpret_cast<int64_t>(executor()->getStringDictionaryProxy(
200  expr_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true));
201  auto dest_string_proxy_handle_lv = cgen_state_->llInt(dest_string_proxy_handle);
202  std::vector<llvm::Value*> string_oper_lvs{primary_str_lv[1],
203  primary_str_lv[2],
204  string_ops_handle_lv,
205  dest_string_proxy_handle_lv};
206 
207  return cgen_state_->emitExternalCall("apply_string_ops_and_encode",
209  string_oper_lvs);
210 }
211 
212 std::unique_ptr<StringDictionaryTranslationMgr> translate_dict_strings(
213  const Analyzer::StringOper* expr,
214  const ExecutorDeviceType device_type,
215  Executor* executor) {
216  const auto& expr_ti = expr->get_type_info();
217  const auto dict_id = expr_ti.get_comp_param();
218  const auto string_op_infos = getStringOpInfos(expr);
219  CHECK(string_op_infos.size());
220 
221  auto string_dictionary_translation_mgr =
222  std::make_unique<StringDictionaryTranslationMgr>(
223  dict_id,
224  dict_id,
225  false, // translate_intersection_only
226  string_op_infos,
229  executor->deviceCount(device_type),
230  executor,
231  &executor->getCatalog()->getDataMgr(),
232  false /* delay_translation */);
233  return string_dictionary_translation_mgr;
234 }
235 
237  const CompilationOptions& co) {
238  CHECK_GE(expr->getArity(), 1UL);
239  if (expr->hasNoneEncodedTextArg()) {
240  return codegenPerRowStringOper(expr, co);
241  }
243 
244  const auto& expr_ti = expr->get_type_info();
245  auto string_dictionary_translation_mgr =
247 
248  auto str_id_lv = codegen(expr->getArg(0), true, co);
249  CHECK_EQ(size_t(1), str_id_lv.size());
250 
251  return cgen_state_
252  ->moveStringDictionaryTranslationMgr(std::move(string_dictionary_translation_mgr))
253  ->codegen(str_id_lv[0], expr_ti, true /* add_nullcheck */, co);
254 }
255 
256 // Method below is for join probes, as we cast the StringOper nodes to ColumnVars early to
257 // not special case that codepath (but retain the StringOpInfos, which we use here to
258 // execute the same string ops as we would on a native StringOper node)
260  const Analyzer::ColumnVar* expr,
261  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos,
262  const CompilationOptions& co) {
264  const auto& expr_ti = expr->get_type_info();
265  const auto dict_id = expr_ti.get_comp_param();
266 
267  auto string_dictionary_translation_mgr =
268  std::make_unique<StringDictionaryTranslationMgr>(
269  dict_id,
270  dict_id,
271  false, // translate_intersection_only
272  string_op_infos,
275  executor()->deviceCount(co.device_type),
276  executor(),
277  &executor()->getCatalog()->getDataMgr(),
278  false /* delay_translation */);
279 
280  auto str_id_lv = codegen(expr, true /* fetch_column */, co);
281  CHECK_EQ(size_t(1), str_id_lv.size());
282 
283  return cgen_state_
284  ->moveStringDictionaryTranslationMgr(std::move(string_dictionary_translation_mgr))
285  ->codegen(str_id_lv[0], expr_ti, true /* add_nullcheck */, co);
286 }
287 
289  const CompilationOptions& co) {
291  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
292  throw std::runtime_error("LIKE not supported for unnested expressions");
293  }
294  char escape_char{'\\'};
295  if (expr->get_escape_expr()) {
296  auto escape_char_expr =
297  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
298  CHECK(escape_char_expr);
299  CHECK(escape_char_expr->get_type_info().is_string());
300  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
301  escape_char = (*escape_char_expr->get_constval().stringval)[0];
302  }
303  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_like_expr());
304  CHECK(pattern);
305  auto fast_dict_like_lv = codegenDictLike(expr->get_own_arg(),
306  pattern,
307  expr->get_is_ilike(),
308  expr->get_is_simple(),
309  escape_char,
310  co);
311  if (fast_dict_like_lv) {
312  return fast_dict_like_lv;
313  }
314  const auto& ti = expr->get_arg()->get_type_info();
315  CHECK(ti.is_string());
316  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
317  throw WatchdogException(
318  "Cannot do LIKE / ILIKE on this dictionary encoded column, its cardinality is "
319  "too high");
320  }
321  auto str_lv = codegen(expr->get_arg(), true, co);
322  if (str_lv.size() != 3) {
323  CHECK_EQ(size_t(1), str_lv.size());
324  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
325  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
326  if (co.device_type == ExecutorDeviceType::GPU) {
327  throw QueryMustRunOnCpu();
328  }
329  }
330  auto like_expr_arg_lvs = codegen(expr->get_like_expr(), true, co);
331  CHECK_EQ(size_t(3), like_expr_arg_lvs.size());
332  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
333  std::vector<llvm::Value*> str_like_args{
334  str_lv[1], str_lv[2], like_expr_arg_lvs[1], like_expr_arg_lvs[2]};
335  std::string fn_name{expr->get_is_ilike() ? "string_ilike" : "string_like"};
336  if (expr->get_is_simple()) {
337  fn_name += "_simple";
338  } else {
339  str_like_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
340  }
341  if (is_nullable) {
342  fn_name += "_nullable";
343  str_like_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
344  }
345  return cgen_state_->emitCall(fn_name, str_like_args);
346 }
347 
349  Executor* executor) {
350  // If here we are operating on top of one or more string functions, i.e. LOWER(str),
351  // and before running the dictionary LIKE/ILIKE or REGEXP_LIKE,
352  // we need to translate the strings first.
353 
354  // This approach is a temporary solution until we can implement the next stage
355  // of the string translation project, which will broaden the StringOper class to include
356  // operations that operate on strings but do not neccessarily return strings like
357  // LIKE/ILIKE/REGEXP_LIKE/CHAR_LENGTH At this point these aforementioned operators,
358  // including LIKE/ILIKE, will just become part of a StringOps chain (which will also
359  // avoid the overhead of serializing the transformed raw strings from previous string
360  // opers to the dictionary to only read back out and perform LIKE/ILIKE.)
361  CHECK_GT(string_oper->getArity(), 0UL);
362  const auto& string_oper_primary_arg_ti = string_oper->getArg(0)->get_type_info();
363  CHECK(string_oper_primary_arg_ti.is_dict_encoded_string());
364  CHECK_NE(string_oper_primary_arg_ti.get_comp_param(), TRANSIENT_DICT_ID);
365  // Note the actual translation below will be cached by RowSetMemOwner
366  translate_dict_strings(string_oper, ExecutorDeviceType::CPU, executor);
367 }
368 
370  const std::shared_ptr<Analyzer::Expr> like_arg,
371  const Analyzer::Constant* pattern,
372  const bool ilike,
373  const bool is_simple,
374  const char escape_char,
375  const CompilationOptions& co) {
377  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(like_arg);
378  if (!cast_oper) {
379  return nullptr;
380  }
381  CHECK(cast_oper);
382  CHECK_EQ(kCAST, cast_oper->get_optype());
383  const auto dict_like_arg = cast_oper->get_own_operand();
384  const auto& dict_like_arg_ti = dict_like_arg->get_type_info();
385  if (!dict_like_arg_ti.is_string()) {
386  throw(std::runtime_error("Cast from " + dict_like_arg_ti.get_type_name() + " to " +
387  cast_oper->get_type_info().get_type_name() +
388  " not supported"));
389  }
390  CHECK_EQ(kENCODING_DICT, dict_like_arg_ti.get_compression());
391  const auto sdp = executor()->getStringDictionaryProxy(
392  dict_like_arg_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
393  if (sdp->storageEntryCount() > 200000000) {
394  return nullptr;
395  }
396  if (sdp->getDictId() == TRANSIENT_DICT_ID) {
397  // If we have a literal dictionary it was a product
398  // of string ops applied to none-encoded strings, and
399  // will not be populated at codegen-time, so we
400  // cannot use the fast path
401 
402  // Todo(todd): Once string ops support non-string producting
403  // operators (like like/ilike), like/ilike can be chained and
404  // we can avoid the string translation
405  return nullptr;
406  }
407  const auto string_oper = dynamic_cast<const Analyzer::StringOper*>(dict_like_arg.get());
408  if (string_oper) {
409  pre_translate_string_ops(string_oper, executor());
410  }
411  const auto& pattern_ti = pattern->get_type_info();
412  CHECK(pattern_ti.is_string());
413  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
414  const auto& pattern_datum = pattern->get_constval();
415  const auto& pattern_str = *pattern_datum.stringval;
416  const auto matching_ids = sdp->getLike(pattern_str, ilike, is_simple, escape_char);
417  // InIntegerSet requires 64-bit values
418  std::vector<int64_t> matching_ids_64(matching_ids.size());
419  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
420  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
421  dict_like_arg, matching_ids_64, dict_like_arg_ti.get_notnull());
422  return codegen(in_values.get(), co);
423 }
424 
425 namespace {
426 
427 std::vector<int32_t> get_compared_ids(const StringDictionaryProxy* dict,
428  const SQLOps compare_operator,
429  const std::string& pattern) {
430  std::vector<int> ret;
431  switch (compare_operator) {
432  case kLT:
433  ret = dict->getCompare(pattern, "<");
434  break;
435  case kLE:
436  ret = dict->getCompare(pattern, "<=");
437  break;
438  case kEQ:
439  case kBW_EQ:
440  ret = dict->getCompare(pattern, "=");
441  break;
442  case kGT:
443  ret = dict->getCompare(pattern, ">");
444  break;
445  case kGE:
446  ret = dict->getCompare(pattern, ">=");
447  break;
448  case kNE:
449  ret = dict->getCompare(pattern, "<>");
450  break;
451  default:
452  std::runtime_error("unsuported operator for string comparision");
453  }
454  return ret;
455 }
456 } // namespace
457 
458 llvm::Value* CodeGenerator::codegenDictStrCmp(const std::shared_ptr<Analyzer::Expr> lhs,
459  const std::shared_ptr<Analyzer::Expr> rhs,
460  const SQLOps compare_operator,
461  const CompilationOptions& co) {
463  auto rhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(rhs);
464  auto lhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(lhs);
465  auto rhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(rhs);
466  auto lhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(lhs);
467  std::shared_ptr<const Analyzer::UOper> cast_oper;
468  std::shared_ptr<const Analyzer::ColumnVar> col_var;
469  auto compare_opr = compare_operator;
470  if (lhs_col_var && rhs_col_var) {
471  if (lhs_col_var->get_type_info().get_comp_param() ==
472  rhs_col_var->get_type_info().get_comp_param()) {
473  if (compare_operator == kEQ || compare_operator == kNE) {
474  // TODO (vraj): implement compare between two dictionary encoded columns which
475  // share a dictionary
476  return nullptr;
477  }
478  }
479  // TODO (vraj): implement compare between two dictionary encoded columns which don't
480  // shared dictionary
481  throw std::runtime_error("Decoding two Dictionary encoded columns will be slow");
482  } else if (lhs_col_var && rhs_cast_oper) {
483  cast_oper.swap(rhs_cast_oper);
484  col_var.swap(lhs_col_var);
485  } else if (lhs_cast_oper && rhs_col_var) {
486  cast_oper.swap(lhs_cast_oper);
487  col_var.swap(rhs_col_var);
488  switch (compare_operator) {
489  case kLT:
490  compare_opr = kGT;
491  break;
492  case kLE:
493  compare_opr = kGE;
494  break;
495  case kGT:
496  compare_opr = kLT;
497  break;
498  case kGE:
499  compare_opr = kLE;
500  default:
501  break;
502  }
503  }
504  if (!cast_oper || !col_var) {
505  return nullptr;
506  }
507  CHECK_EQ(kCAST, cast_oper->get_optype());
508 
509  const auto const_expr =
510  dynamic_cast<Analyzer::Constant*>(cast_oper->get_own_operand().get());
511  if (!const_expr) {
512  // Analyzer casts dictionary encoded columns to none encoded if there is a comparison
513  // between two encoded columns. Which we currently do not handle.
514  return nullptr;
515  }
516  const auto& const_val = const_expr->get_constval();
517 
518  const auto col_ti = col_var->get_type_info();
519  CHECK(col_ti.is_string());
520  CHECK_EQ(kENCODING_DICT, col_ti.get_compression());
521  const auto sdp = executor()->getStringDictionaryProxy(
522  col_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
523 
524  if (sdp->storageEntryCount() > 200000000) {
525  std::runtime_error("Cardinality for string dictionary is too high");
526  return nullptr;
527  }
528 
529  const auto& pattern_str = *const_val.stringval;
530  const auto matching_ids = get_compared_ids(sdp, compare_opr, pattern_str);
531 
532  // InIntegerSet requires 64-bit values
533  std::vector<int64_t> matching_ids_64(matching_ids.size());
534  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
535 
536  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
537  col_var, matching_ids_64, col_ti.get_notnull());
538  return codegen(in_values.get(), co);
539 }
540 
542  const CompilationOptions& co) {
544  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
545  throw std::runtime_error("REGEXP not supported for unnested expressions");
546  }
547  char escape_char{'\\'};
548  if (expr->get_escape_expr()) {
549  auto escape_char_expr =
550  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
551  CHECK(escape_char_expr);
552  CHECK(escape_char_expr->get_type_info().is_string());
553  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
554  escape_char = (*escape_char_expr->get_constval().stringval)[0];
555  }
556  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_pattern_expr());
557  CHECK(pattern);
558  auto fast_dict_pattern_lv =
559  codegenDictRegexp(expr->get_own_arg(), pattern, escape_char, co);
560  if (fast_dict_pattern_lv) {
561  return fast_dict_pattern_lv;
562  }
563  const auto& ti = expr->get_arg()->get_type_info();
564  CHECK(ti.is_string());
565  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
566  throw WatchdogException(
567  "Cannot do REGEXP_LIKE on this dictionary encoded column, its cardinality is too "
568  "high");
569  }
570  // Now we know we are working on NONE ENCODED column. So switch back to CPU
572  throw QueryMustRunOnCpu();
573  }
574  auto str_lv = codegen(expr->get_arg(), true, co);
575  if (str_lv.size() != 3) {
576  CHECK_EQ(size_t(1), str_lv.size());
577  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
578  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
579  }
580  auto regexp_expr_arg_lvs = codegen(expr->get_pattern_expr(), true, co);
581  CHECK_EQ(size_t(3), regexp_expr_arg_lvs.size());
582  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
583  std::vector<llvm::Value*> regexp_args{
584  str_lv[1], str_lv[2], regexp_expr_arg_lvs[1], regexp_expr_arg_lvs[2]};
585  std::string fn_name("regexp_like");
586  regexp_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
587  if (is_nullable) {
588  fn_name += "_nullable";
589  regexp_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
590  return cgen_state_->emitExternalCall(
591  fn_name, get_int_type(8, cgen_state_->context_), regexp_args);
592  }
593  return cgen_state_->emitExternalCall(
594  fn_name, get_int_type(1, cgen_state_->context_), regexp_args);
595 }
596 
598  const std::shared_ptr<Analyzer::Expr> pattern_arg,
599  const Analyzer::Constant* pattern,
600  const char escape_char,
601  const CompilationOptions& co) {
603  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(pattern_arg);
604  if (!cast_oper) {
605  return nullptr;
606  }
607  CHECK(cast_oper);
608  CHECK_EQ(kCAST, cast_oper->get_optype());
609  const auto dict_regexp_arg = cast_oper->get_own_operand();
610  const auto& dict_regexp_arg_ti = dict_regexp_arg->get_type_info();
611  CHECK(dict_regexp_arg_ti.is_string());
612  CHECK_EQ(kENCODING_DICT, dict_regexp_arg_ti.get_compression());
613  const auto comp_param = dict_regexp_arg_ti.get_comp_param();
614  const auto sdp = executor()->getStringDictionaryProxy(
615  comp_param, executor()->getRowSetMemoryOwner(), true);
616  if (sdp->storageEntryCount() > 15000000) {
617  return nullptr;
618  }
619  if (sdp->getDictId() == TRANSIENT_DICT_ID) {
620  // If we have a literal dictionary it was a product
621  // of string ops applied to none-encoded strings, and
622  // will not be populated at codegen-time, so we
623  // cannot use the fast path
624 
625  // Todo(todd): Once string ops support non-string producting
626  // operators (like regexp_like), these operators can be chained
627  // and we can avoid the string translation
628  return nullptr;
629  }
630  const auto string_oper =
631  dynamic_cast<const Analyzer::StringOper*>(dict_regexp_arg.get());
632  if (string_oper) {
633  pre_translate_string_ops(string_oper, executor());
634  }
635  const auto& pattern_ti = pattern->get_type_info();
636  CHECK(pattern_ti.is_string());
637  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
638  const auto& pattern_datum = pattern->get_constval();
639  const auto& pattern_str = *pattern_datum.stringval;
640  const auto matching_ids = sdp->getRegexpLike(pattern_str, escape_char);
641  // InIntegerSet requires 64-bit values
642  std::vector<int64_t> matching_ids_64(matching_ids.size());
643  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
644  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
645  dict_regexp_arg, matching_ids_64, dict_regexp_arg_ti.get_notnull());
646  return codegen(in_values.get(), co);
647 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
std::pair< const char *, size_t > getStringBytes(int32_t string_id) const noexcept
llvm::Value * codegenPerRowStringOper(const Analyzer::StringOper *string_oper, const CompilationOptions &co)
bool hasNoneEncodedTextArg() const
Definition: Analyzer.h:1541
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:976
RUNTIME_EXPORT int32_t union_translate_string_id_to_other_dict(const int32_t string_id, const int64_t source_string_dict_handle, const int64_t dest_string_dict_handle)
Definition: StringOpsIR.cpp:96
std::vector< int32_t > get_compared_ids(const StringDictionaryProxy *dict, const SQLOps compare_operator, const std::string &pattern)
const Expr * get_escape_expr() const
Definition: Analyzer.h:906
std::shared_ptr< Analyzer::Expr > remove_cast(const std::shared_ptr< Analyzer::Expr > &expr)
Definition: Analyzer.cpp:4196
ExecutorDeviceType
CgenState * cgen_state_
bool is_null
Definition: sqltypes.h:153
std::unique_ptr< StringDictionaryTranslationMgr > translate_dict_strings(const Analyzer::StringOper *expr, const ExecutorDeviceType device_type, Executor *executor)
void pre_translate_string_ops(const Analyzer::StringOper *string_oper, Executor *executor)
const Expr * get_escape_expr() const
Definition: Analyzer.h:978
llvm::Value * emitExternalCall(const std::string &fname, llvm::Type *ret_type, const std::vector< llvm::Value * > args, const std::vector< llvm::Attribute::AttrKind > &fnattrs={}, const bool has_struct_return=false)
Definition: CgenState.h:217
SQLOps
Definition: sqldefs.h:28
Definition: sqldefs.h:34
Definition: sqldefs.h:35
#define CHECK_GE(x, y)
Definition: Logger.h:235
Definition: sqldefs.h:48
llvm::Value * codegenPseudoStringOper(const Analyzer::ColumnVar *, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos, const CompilationOptions &)
Definition: sqldefs.h:29
const Expr * get_arg() const
Definition: Analyzer.h:975
size_t getArity() const
Definition: Analyzer.h:1506
const Analyzer::Expr * extract_cast_arg(const Analyzer::Expr *expr)
Definition: Execute.h:201
std::string getString(int32_t string_id) const
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
#define CHECK_GT(x, y)
Definition: Logger.h:234
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:182
const Expr * get_arg() const
Definition: Analyzer.h:903
int8_t * pointer
Definition: sqltypes.h:152
#define NULL_INT
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
RUNTIME_EXPORT NEVER_INLINE int32_t extract_str_len_noinline(const uint64_t str_and_len)
llvm::LLVMContext & context_
Definition: CgenState.h:359
Classes representing a parse tree.
DEVICE auto copy(ARGS &&...args)
Definition: gpu_enabled.h:51
#define CHECK_NE(x, y)
Definition: Logger.h:231
bool g_enable_watchdog
bool get_is_simple() const
Definition: Analyzer.h:908
llvm::Value * codegenDictStrCmp(const std::shared_ptr< Analyzer::Expr >, const std::shared_ptr< Analyzer::Expr >, const SQLOps, const CompilationOptions &co)
llvm::Value * codegenDictRegexp(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const char escape_char, const CompilationOptions &)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:81
llvm::Value * emitCall(const std::string &fname, const std::vector< llvm::Value * > &args)
Definition: CgenState.cpp:215
std::string * stringval
Definition: sqltypes.h:220
ExecutorDeviceType device_type
#define RUNTIME_EXPORT
std::vector< StringOps_Namespace::StringOpInfo > getStringOpInfos(const Analyzer::StringOper *expr)
Definition: sqldefs.h:33
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:30
const Expr * get_pattern_expr() const
Definition: Analyzer.h:977
RUNTIME_EXPORT NEVER_INLINE int8_t * extract_str_ptr_noinline(const uint64_t str_and_len)
Expression class for string functions The &quot;arg&quot; constructor parameter must be an expression that reso...
Definition: Analyzer.h:1463
#define TRANSIENT_DICT_ID
Definition: sqltypes.h:259
RUNTIME_EXPORT int32_t intersect_translate_string_id_to_other_dict(const int32_t string_id, const int64_t source_string_dict_handle, const int64_t dest_string_dict_handle)
Definition: StringOpsIR.cpp:80
RUNTIME_EXPORT int32_t apply_string_ops_and_encode(const char *str_ptr, const int32_t str_len, const int64_t string_ops_handle, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:64
const Expr * get_like_expr() const
Definition: Analyzer.h:905
Datum get_constval() const
Definition: Analyzer.h:342
Definition: sqldefs.h:31
const Expr * get_arg() const
Definition: Analyzer.h:710
RUNTIME_EXPORT int32_t string_compress(const int64_t ptr_and_len, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:52
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:338
const Expr * get_arg() const
Definition: Analyzer.h:759
llvm::ConstantInt * llInt(const T v) const
Definition: CgenState.h:296
#define CHECK(condition)
Definition: Logger.h:222
Definition: sqldefs.h:30
int32_t getOrAddTransient(const std::string &str)
bool g_cluster
Definition: sqldefs.h:32
const StringDictionaryTranslationMgr * moveStringDictionaryTranslationMgr(std::unique_ptr< const StringDictionaryTranslationMgr > &&str_dict_translation_mgr)
Definition: CgenState.h:196
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:904
bool is_unnest(const Analyzer::Expr *expr)
Definition: Execute.h:1422
__device__ uint64_t string_decode(int8_t *chunk_iter_, int64_t pos)
llvm::Value * codegen(llvm::Value *str_id_input, const SQLTypeInfo &input_ti, const bool add_nullcheck, const CompilationOptions &co) const
std::vector< std::shared_ptr< Analyzer::Expr > > getChainedStringOpExprs() const
Definition: Analyzer.h:1530
const Expr * getArg(const size_t i) const
Definition: Analyzer.h:1518
RUNTIME_EXPORT uint64_t string_decompress(const int32_t string_id, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:39
bool get_is_ilike() const
Definition: Analyzer.h:907
llvm::Value * codegenDictLike(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const bool ilike, const bool is_simple, const char escape_char, const CompilationOptions &)
size_t length
Definition: sqltypes.h:151
Executor * executor() const