OmniSciDB  04ee39c94c
StringOpsIR.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
19 
20 #include "../Shared/sqldefs.h"
21 #include "Parser/ParserNode.h"
22 
23 extern "C" uint64_t string_decode(int8_t* chunk_iter_, int64_t pos) {
24  auto chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
25  VarlenDatum vd;
26  bool is_end;
27  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
28  CHECK(!is_end);
29  return vd.is_null ? 0
30  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
31  (static_cast<uint64_t>(vd.length) << 48);
32 }
33 
34 extern "C" uint64_t string_decompress(const int32_t string_id,
35  const int64_t string_dict_handle) {
36  if (string_id == NULL_INT) {
37  return 0;
38  }
39  auto string_dict_proxy =
40  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
41  auto string_bytes = string_dict_proxy->getStringBytes(string_id);
42  CHECK(string_bytes.first);
43  return (reinterpret_cast<uint64_t>(string_bytes.first) & 0xffffffffffff) |
44  (static_cast<uint64_t>(string_bytes.second) << 48);
45 }
46 
47 extern "C" int32_t string_compress(const int64_t ptr_and_len,
48  const int64_t string_dict_handle) {
49  std::string raw_str(reinterpret_cast<char*>(extract_str_ptr_noinline(ptr_and_len)),
50  extract_str_len_noinline(ptr_and_len));
51  auto string_dict_proxy =
52  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
53  return string_dict_proxy->getIdOfString(raw_str);
54 }
55 
// Code generation for a character-length expression: emits a call to one of the
// runtime functions "char_length", "char_length_encoded", "char_length_nullable" or
// "char_length_encoded_nullable" on the argument's pointer/length pair.
// NOTE(review): the opening line of this signature (listing line 56) and listing
// lines 67 and 82 inside the body are absent from this listing — confirm against the
// original file before editing.
57  const CompilationOptions& co) {
58  auto str_lv = codegen(expr->get_arg(), true, co);
// Anything other than a three-element result must be a single value; its pointer and
// length are then extracted and appended as elements 1 and 2.
59  if (str_lv.size() != 3) {
60  CHECK_EQ(size_t(1), str_lv.size());
// With the watchdog enabled this path is refused outright.
61  if (g_enable_watchdog) {
62  throw WatchdogException(
63  "LENGTH / CHAR_LENGTH on dictionary-encoded strings would be slow");
64  }
65  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
66  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
// NOTE(review): the condition guarding this throw (listing line 67) is not visible
// here; presumably a device-type check — confirm.
68  throw QueryMustRunOnCpu();
69  }
70  }
71  std::vector<llvm::Value*> charlength_args{str_lv[1], str_lv[2]};
72  std::string fn_name("char_length");
73  if (expr->get_calc_encoded_length()) {
74  fn_name += "_encoded";
75  }
// The nullable variant takes one extra argument: the inline integer null value for
// this expression's type.
76  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
77  if (is_nullable) {
78  fn_name += "_nullable";
79  charlength_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
80  }
// NOTE(review): the first line of the true branch (listing line 82) is not visible;
// what remains passes a 32-bit integer return type, while the false branch uses
// emitCall.
81  return expr->get_calc_encoded_length()
83  fn_name, get_int_type(32, cgen_state_->context_), charlength_args)
84  : cgen_state_->emitCall(fn_name, charlength_args);
85 }
86 
// Generates the argument expression, requires it to yield exactly one value, and
// emits a call to the runtime function "key_for_string_encoded" on that value.
// NOTE(review): the opening line of this signature (listing line 87) is absent from
// this listing — confirm the expression type against the original file.
88  const CompilationOptions& co) {
89  auto str_lv = codegen(expr->get_arg(), true, co);
90  CHECK_EQ(size_t(1), str_lv.size());
91  return cgen_state_->emitCall("key_for_string_encoded", str_lv);
92 }
93 
// Code generation for LIKE / ILIKE.
// First tries codegenDictLike; if that yields nothing, emits a call to one of the
// runtime functions "string_like" / "string_ilike" (with optional "_simple" and
// "_nullable" suffixes) on the argument's and the pattern's pointer/length pairs.
// Throws std::runtime_error for unnested arguments, and WatchdogException when the
// watchdog is enabled and the argument is not kENCODING_NONE.
94 llvm::Value* CodeGenerator::codegen(const Analyzer::LikeExpr* expr,
95  const CompilationOptions& co) {
96  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
97  throw std::runtime_error("LIKE not supported for unnested expressions");
98  }
// The escape character defaults to a backslash; an explicit escape expression must be
// a string constant of exactly one character.
99  char escape_char{'\\'};
100  if (expr->get_escape_expr()) {
101  auto escape_char_expr =
102  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
103  CHECK(escape_char_expr);
104  CHECK(escape_char_expr->get_type_info().is_string());
105  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
106  escape_char = (*escape_char_expr->get_constval().stringval)[0];
107  }
// The pattern itself must be a constant.
108  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_like_expr());
109  CHECK(pattern);
// Dictionary-based path; a null result means it did not apply and the generic path
// below is used instead.
110  auto fast_dict_like_lv = codegenDictLike(expr->get_own_arg(),
111  pattern,
112  expr->get_is_ilike(),
113  expr->get_is_simple(),
114  escape_char,
115  co);
116  if (fast_dict_like_lv) {
117  return fast_dict_like_lv;
118  }
119  const auto& ti = expr->get_arg()->get_type_info();
120  CHECK(ti.is_string());
121  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
122  throw WatchdogException(
123  "Cannot do LIKE / ILIKE on this dictionary encoded column, its cardinality is "
124  "too high");
125  }
126  auto str_lv = codegen(expr->get_arg(), true, co);
// A single-value result is unpacked into pointer (element 1) and length (element 2).
127  if (str_lv.size() != 3) {
128  CHECK_EQ(size_t(1), str_lv.size());
129  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
130  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
// NOTE(review): the condition guarding this throw (listing line 131) is not visible
// in this listing; presumably a device-type check — confirm against the original file.
132  throw QueryMustRunOnCpu();
133  }
134  }
135  auto like_expr_arg_lvs = codegen(expr->get_like_expr(), true, co);
136  CHECK_EQ(size_t(3), like_expr_arg_lvs.size());
137  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
138  std::vector<llvm::Value*> str_like_args{
139  str_lv[1], str_lv[2], like_expr_arg_lvs[1], like_expr_arg_lvs[2]};
140  std::string fn_name{expr->get_is_ilike() ? "string_ilike" : "string_like"};
// Only the non-simple variants receive the escape character as an argument.
141  if (expr->get_is_simple()) {
142  fn_name += "_simple";
143  } else {
144  str_like_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
145  }
// Nullable variants additionally receive the inline integer null for the result type.
146  if (is_nullable) {
147  fn_name += "_nullable";
148  str_like_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
149  }
150  return cgen_state_->emitCall(fn_name, str_like_args);
151 }
152 
// Dictionary-based LIKE: when `like_arg` is a cast of a dictionary-encoded string
// expression, asks the string dictionary proxy for the ids matching `pattern` and
// generates code for an InIntegerSet test of the operand against those ids.
// Returns nullptr when `like_arg` is not a unary operator or when the dictionary
// holds more than 200000000 entries. Throws std::runtime_error when the cast operand
// is not a string.
// NOTE(review): the opening line of this signature (listing line 153) is absent from
// this listing.
154  const std::shared_ptr<Analyzer::Expr> like_arg,
155  const Analyzer::Constant* pattern,
156  const bool ilike,
157  const bool is_simple,
158  const char escape_char,
159  const CompilationOptions& co) {
160  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(like_arg);
161  if (!cast_oper) {
162  return nullptr;
163  }
// This CHECK cannot fail: the null case has already returned above.
164  CHECK(cast_oper);
165  CHECK_EQ(kCAST, cast_oper->get_optype());
166  const auto dict_like_arg = cast_oper->get_own_operand();
167  const auto& dict_like_arg_ti = dict_like_arg->get_type_info();
168  if (!dict_like_arg_ti.is_string()) {
169  throw(std::runtime_error("Cast from " + dict_like_arg_ti.get_type_name() + " to " +
170  cast_oper->get_type_info().get_type_name() +
171  " not supported"));
172  }
173  CHECK_EQ(kENCODING_DICT, dict_like_arg_ti.get_compression());
174  const auto sdp = executor()->getStringDictionaryProxy(
175  dict_like_arg_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
// Above this entry count the dictionary path is abandoned and the caller falls back.
176  if (sdp->storageEntryCount() > 200000000) {
177  return nullptr;
178  }
179  const auto& pattern_ti = pattern->get_type_info();
180  CHECK(pattern_ti.is_string());
181  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
182  const auto& pattern_datum = pattern->get_constval();
183  const auto& pattern_str = *pattern_datum.stringval;
184  const auto matching_ids = sdp->getLike(pattern_str, ilike, is_simple, escape_char);
185  // InIntegerSet requires 64-bit values
186  std::vector<int64_t> matching_ids_64(matching_ids.size());
187  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
188  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
189  dict_like_arg, matching_ids_64, dict_like_arg_ti.get_notnull());
190  return codegen(in_values.get(), co);
191 }
192 
193 namespace {
194 
195 std::vector<int32_t> get_compared_ids(const StringDictionaryProxy* dict,
196  const SQLOps compare_operator,
197  const std::string& pattern) {
198  std::vector<int> ret;
199  switch (compare_operator) {
200  case kLT:
201  ret = dict->getCompare(pattern, "<");
202  break;
203  case kLE:
204  ret = dict->getCompare(pattern, "<=");
205  break;
206  case kEQ:
207  case kBW_EQ:
208  ret = dict->getCompare(pattern, "=");
209  break;
210  case kGT:
211  ret = dict->getCompare(pattern, ">");
212  break;
213  case kGE:
214  ret = dict->getCompare(pattern, ">=");
215  break;
216  case kNE:
217  ret = dict->getCompare(pattern, "<>");
218  break;
219  default:
220  std::runtime_error("unsuported operator for string comparision");
221  }
222  return ret;
223 }
224 } // namespace
225 
226 llvm::Value* CodeGenerator::codegenDictStrCmp(const std::shared_ptr<Analyzer::Expr> lhs,
227  const std::shared_ptr<Analyzer::Expr> rhs,
228  const SQLOps compare_operator,
229  const CompilationOptions& co) {
230  auto rhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(rhs);
231  auto lhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(lhs);
232  auto rhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(rhs);
233  auto lhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(lhs);
234  std::shared_ptr<const Analyzer::UOper> cast_oper;
235  std::shared_ptr<const Analyzer::ColumnVar> col_var;
236  auto compare_opr = compare_operator;
237  if (lhs_col_var && rhs_col_var) {
238  if (lhs_col_var->get_type_info().get_comp_param() ==
239  rhs_col_var->get_type_info().get_comp_param()) {
240  if (compare_operator == kEQ || compare_operator == kNE) {
241  // TODO (vraj): implement compare between two dictionary encoded columns which
242  // share a dictionary
243  return nullptr;
244  }
245  }
246  // TODO (vraj): implement compare between two dictionary encoded columns which don't
247  // shared dictionary
248  throw std::runtime_error("Decoding two Dictionary encoded columns will be slow");
249  } else if (lhs_col_var && rhs_cast_oper) {
250  cast_oper.swap(rhs_cast_oper);
251  col_var.swap(lhs_col_var);
252  } else if (lhs_cast_oper && rhs_col_var) {
253  cast_oper.swap(lhs_cast_oper);
254  col_var.swap(rhs_col_var);
255  switch (compare_operator) {
256  case kLT:
257  compare_opr = kGT;
258  break;
259  case kLE:
260  compare_opr = kGE;
261  break;
262  case kGT:
263  compare_opr = kLT;
264  break;
265  case kGE:
266  compare_opr = kLE;
267  default:
268  break;
269  }
270  }
271  if (!cast_oper || !col_var) {
272  return nullptr;
273  }
274  CHECK_EQ(kCAST, cast_oper->get_optype());
275 
276  const auto const_expr =
277  dynamic_cast<Analyzer::Constant*>(cast_oper->get_own_operand().get());
278  if (!const_expr) {
279  // Analyzer casts dictionary encoded columns to none encoded if there is a comparison
280  // between two encoded columns. Which we currently do not handle.
281  return nullptr;
282  }
283  const auto& const_val = const_expr->get_constval();
284 
285  const auto col_ti = col_var->get_type_info();
286  CHECK(col_ti.is_string());
287  CHECK_EQ(kENCODING_DICT, col_ti.get_compression());
288  const auto sdp = executor()->getStringDictionaryProxy(
289  col_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
290 
291  if (sdp->storageEntryCount() > 200000000) {
292  std::runtime_error("Cardinality for string dictionary is too high");
293  return nullptr;
294  }
295 
296  const auto& pattern_str = *const_val.stringval;
297  const auto matching_ids = get_compared_ids(sdp, compare_opr, pattern_str);
298 
299  // InIntegerSet requires 64-bit values
300  std::vector<int64_t> matching_ids_64(matching_ids.size());
301  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
302 
303  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
304  col_var, matching_ids_64, col_ti.get_notnull());
305  return codegen(in_values.get(), co);
306 }
307 
// Code generation for a regular-expression match.
// First tries codegenDictRegexp; if that yields nothing, the visible code builds an
// argument list for the runtime function "regexp_like" (or "regexp_like_nullable")
// from the argument's and the pattern's pointer/length pairs plus the escape character.
// Throws std::runtime_error for unnested arguments, and WatchdogException when the
// watchdog is enabled and the argument is not kENCODING_NONE.
// NOTE(review): the opening line of this signature (listing line 308) and listing
// lines 337, 356 and 359 inside the body are absent from this listing — confirm
// against the original file before editing.
309  const CompilationOptions& co) {
310  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
311  throw std::runtime_error("REGEXP not supported for unnested expressions");
312  }
// The escape character defaults to a backslash; an explicit escape expression must be
// a string constant of exactly one character.
313  char escape_char{'\\'};
314  if (expr->get_escape_expr()) {
315  auto escape_char_expr =
316  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
317  CHECK(escape_char_expr);
318  CHECK(escape_char_expr->get_type_info().is_string());
319  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
320  escape_char = (*escape_char_expr->get_constval().stringval)[0];
321  }
// The pattern itself must be a constant.
322  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_pattern_expr());
323  CHECK(pattern);
// Dictionary-based path; a null result means it did not apply.
324  auto fast_dict_pattern_lv =
325  codegenDictRegexp(expr->get_own_arg(), pattern, escape_char, co);
326  if (fast_dict_pattern_lv) {
327  return fast_dict_pattern_lv;
328  }
329  const auto& ti = expr->get_arg()->get_type_info();
330  CHECK(ti.is_string());
331  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
332  throw WatchdogException(
333  "Cannot do REGEXP_LIKE on this dictionary encoded column, its cardinality is too "
334  "high");
335  }
336  // Now we know we are working on NONE ENCODED column. So switch back to CPU
// NOTE(review): the condition guarding this throw (listing line 337) is not visible
// here; presumably a device-type check — confirm.
338  throw QueryMustRunOnCpu();
339  }
340  auto str_lv = codegen(expr->get_arg(), true, co);
// A single-value result is unpacked into pointer (element 1) and length (element 2).
341  if (str_lv.size() != 3) {
342  CHECK_EQ(size_t(1), str_lv.size());
343  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
344  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
345  }
346  auto regexp_expr_arg_lvs = codegen(expr->get_pattern_expr(), true, co);
347  CHECK_EQ(size_t(3), regexp_expr_arg_lvs.size());
348  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
349  std::vector<llvm::Value*> regexp_args{
350  str_lv[1], str_lv[2], regexp_expr_arg_lvs[1], regexp_expr_arg_lvs[2]};
351  std::string fn_name("regexp_like");
352  regexp_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
// The nullable variant receives the inline integer null for the result type.
353  if (is_nullable) {
354  fn_name += "_nullable";
355  regexp_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
// NOTE(review): the call opener (listing line 356) is not visible; the visible
// arguments pass an 8-bit integer type for the nullable variant.
357  fn_name, get_int_type(8, cgen_state_->context_), regexp_args);
358  }
// NOTE(review): the call opener (listing line 359) is not visible; the visible
// arguments pass a 1-bit integer type for the non-nullable variant.
360  fn_name, get_int_type(1, cgen_state_->context_), regexp_args);
361 }
362 
// Dictionary-based regular-expression match: when `pattern_arg` is a cast of a
// dictionary-encoded string expression, asks the string dictionary proxy for the ids
// matching `pattern` and generates code for an InIntegerSet test of the operand
// against those ids.
// Returns nullptr when `pattern_arg` is not a unary operator or when the dictionary
// holds more than 15000000 entries.
// NOTE(review): the opening line of this signature (listing line 363) is absent from
// this listing.
364  const std::shared_ptr<Analyzer::Expr> pattern_arg,
365  const Analyzer::Constant* pattern,
366  const char escape_char,
367  const CompilationOptions& co) {
368  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(pattern_arg);
369  if (!cast_oper) {
370  return nullptr;
371  }
// This CHECK cannot fail: the null case has already returned above.
372  CHECK(cast_oper);
373  CHECK_EQ(kCAST, cast_oper->get_optype());
374  const auto dict_regexp_arg = cast_oper->get_own_operand();
375  const auto& dict_regexp_arg_ti = dict_regexp_arg->get_type_info();
376  CHECK(dict_regexp_arg_ti.is_string());
377  CHECK_EQ(kENCODING_DICT, dict_regexp_arg_ti.get_compression());
378  const auto comp_param = dict_regexp_arg_ti.get_comp_param();
379  const auto sdp = executor()->getStringDictionaryProxy(
380  comp_param, executor()->getRowSetMemoryOwner(), true);
// Above this entry count the dictionary path is abandoned and the caller falls back.
381  if (sdp->storageEntryCount() > 15000000) {
382  return nullptr;
383  }
384  const auto& pattern_ti = pattern->get_type_info();
385  CHECK(pattern_ti.is_string());
386  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
387  const auto& pattern_datum = pattern->get_constval();
388  const auto& pattern_str = *pattern_datum.stringval;
389  const auto matching_ids = sdp->getRegexpLike(pattern_str, escape_char);
390  // InIntegerSet requires 64-bit values
391  std::vector<int64_t> matching_ids_64(matching_ids.size());
392  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
393  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
394  dict_regexp_arg, matching_ids_64, dict_regexp_arg_ti.get_notnull());
395  return codegen(in_values.get(), co);
396 }
int32_t getIdOfString(const std::string &str) const
std::pair< char *, size_t > getStringBytes(int32_t string_id) const noexcept
#define CHECK_EQ(x, y)
Definition: Logger.h:195
int8_t * extract_str_ptr_noinline(const uint64_t str_and_len)
bool get_is_simple() const
Definition: Analyzer.h:802
std::vector< int32_t > get_compared_ids(const StringDictionaryProxy *dict, const SQLOps compare_operator, const std::string &pattern)
bool get_calc_encoded_length() const
Definition: Analyzer.h:651
CgenState * cgen_state_
bool is_null
Definition: sqltypes.h:73
Executor * executor() const
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:330
SQLOps
Definition: sqldefs.h:29
Definition: sqldefs.h:35
Definition: sqldefs.h:36
Definition: sqldefs.h:49
Definition: sqldefs.h:30
const Analyzer::Expr * extract_cast_arg(const Analyzer::Expr *expr)
Definition: Execute.h:149
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:181
int8_t * pointer
Definition: sqltypes.h:72
const Expr * get_arg() const
Definition: Analyzer.h:698
const Expr * get_arg() const
Definition: Analyzer.h:797
Datum get_constval() const
Definition: Analyzer.h:328
llvm::LLVMContext & context_
Definition: CgenState.h:266
Classes representing a parse tree.
const Expr * get_escape_expr() const
Definition: Analyzer.h:800
int32_t extract_str_len_noinline(const uint64_t str_and_len)
llvm::ConstantInt * inlineIntNull(const SQLTypeInfo &)
Definition: CgenState.cpp:24
llvm::Value * emitExternalCall(const std::string &fname, llvm::Type *ret_type, const std::vector< llvm::Value *> args, const std::vector< llvm::Attribute::AttrKind > &fnattrs={})
Definition: CgenState.h:203
uint64_t string_decode(int8_t *chunk_iter_, int64_t pos)
Definition: StringOpsIR.cpp:23
llvm::Value * codegenDictStrCmp(const std::shared_ptr< Analyzer::Expr >, const std::shared_ptr< Analyzer::Expr >, const SQLOps, const CompilationOptions &co)
#define NULL_INT
Definition: sqltypes.h:173
llvm::Value * codegenDictRegexp(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const char escape_char, const CompilationOptions &)
ExecutorDeviceType device_type_
std::string * stringval
Definition: sqltypes.h:131
Definition: sqldefs.h:34
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:25
uint64_t string_decompress(const int32_t string_id, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:34
const Expr * get_arg() const
Definition: Analyzer.h:869
const Expr * get_pattern_expr() const
Definition: Analyzer.h:871
const Expr * get_escape_expr() const
Definition: Analyzer.h:872
llvm::Value * emitCall(const std::string &fname, const std::vector< llvm::Value *> &args)
Definition: CgenState.cpp:134
Definition: sqldefs.h:32
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:870
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:77
#define CHECK(condition)
Definition: Logger.h:187
Definition: sqldefs.h:31
const Expr * get_arg() const
Definition: Analyzer.h:649
bool g_enable_watchdog
Definition: Execute.cpp:69
Definition: sqldefs.h:33
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
bool is_unnest(const Analyzer::Expr *expr)
Definition: Execute.h:1059
const Expr * get_like_expr() const
Definition: Analyzer.h:799
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:798
int32_t string_compress(const int64_t ptr_and_len, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:47
bool get_is_ilike() const
Definition: Analyzer.h:801
llvm::ConstantInt * llInt(const T v) const
Definition: CgenState.h:247
llvm::Value * codegenDictLike(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const bool ilike, const bool is_simple, const char escape_char, const CompilationOptions &)
size_t length
Definition: sqltypes.h:71