OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringOpsIR.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
19 
20 #include "../Shared/sqldefs.h"
21 #include "Parser/ParserNode.h"
22 
23 #include <boost/locale/conversion.hpp>
24 
25 extern "C" uint64_t string_decode(int8_t* chunk_iter_, int64_t pos) {
26  auto chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
27  VarlenDatum vd;
28  bool is_end;
29  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
30  CHECK(!is_end);
31  return vd.is_null ? 0
32  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
33  (static_cast<uint64_t>(vd.length) << 48);
34 }
35 
36 extern "C" uint64_t string_decompress(const int32_t string_id,
37  const int64_t string_dict_handle) {
38  if (string_id == NULL_INT) {
39  return 0;
40  }
41  auto string_dict_proxy =
42  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
43  auto string_bytes = string_dict_proxy->getStringBytes(string_id);
44  CHECK(string_bytes.first);
45  return (reinterpret_cast<uint64_t>(string_bytes.first) & 0xffffffffffff) |
46  (static_cast<uint64_t>(string_bytes.second) << 48);
47 }
48 
49 extern "C" int32_t string_compress(const int64_t ptr_and_len,
50  const int64_t string_dict_handle) {
51  std::string raw_str(reinterpret_cast<char*>(extract_str_ptr_noinline(ptr_and_len)),
52  extract_str_len_noinline(ptr_and_len));
53  auto string_dict_proxy =
54  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
55  return string_dict_proxy->getIdOfString(raw_str);
56 }
57 
58 extern "C" int32_t lower_encoded(int32_t string_id, int64_t string_dict_proxy_address) {
59  StringDictionaryProxy* string_dict_proxy =
60  reinterpret_cast<StringDictionaryProxy*>(string_dict_proxy_address);
61  auto str = string_dict_proxy->getString(string_id);
62  return string_dict_proxy->getOrAddTransient(boost::locale::to_lower(str));
63 }
64 
66  const CompilationOptions& co) {
67  auto str_lv = codegen(expr->get_arg(), true, co);
68  if (str_lv.size() != 3) {
69  CHECK_EQ(size_t(1), str_lv.size());
70  if (g_enable_watchdog) {
71  throw WatchdogException(
72  "LENGTH / CHAR_LENGTH on dictionary-encoded strings would be slow");
73  }
74  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
75  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
76  if (co.device_type_ == ExecutorDeviceType::GPU) {
77  throw QueryMustRunOnCpu();
78  }
79  }
80  std::vector<llvm::Value*> charlength_args{str_lv[1], str_lv[2]};
81  std::string fn_name("char_length");
82  if (expr->get_calc_encoded_length()) {
83  fn_name += "_encoded";
84  }
85  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
86  if (is_nullable) {
87  fn_name += "_nullable";
88  charlength_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
89  }
90  return expr->get_calc_encoded_length()
91  ? cgen_state_->emitExternalCall(
92  fn_name, get_int_type(32, cgen_state_->context_), charlength_args)
93  : cgen_state_->emitCall(fn_name, charlength_args);
94 }
95 
97  const CompilationOptions& co) {
98  auto str_lv = codegen(expr->get_arg(), true, co);
99  CHECK_EQ(size_t(1), str_lv.size());
100  return cgen_state_->emitCall("key_for_string_encoded", str_lv);
101 }
102 
104  const CompilationOptions& co) {
106  throw QueryMustRunOnCpu();
107  }
108 
109  auto str_id_lv = codegen(expr->get_arg(), true, co);
110  CHECK_EQ(size_t(1), str_id_lv.size());
111 
112  const auto string_dictionary_proxy = executor()->getStringDictionaryProxy(
113  expr->get_type_info().get_comp_param(), executor()->getRowSetMemoryOwner(), true);
114  CHECK(string_dictionary_proxy);
115 
116  std::vector<llvm::Value*> args{
117  str_id_lv[0],
118  cgen_state_->llInt(reinterpret_cast<int64_t>(string_dictionary_proxy))};
119 
121  "lower_encoded", get_int_type(32, cgen_state_->context_), args);
122 }
123 
125  const CompilationOptions& co) {
126  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
127  throw std::runtime_error("LIKE not supported for unnested expressions");
128  }
129  char escape_char{'\\'};
130  if (expr->get_escape_expr()) {
131  auto escape_char_expr =
132  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
133  CHECK(escape_char_expr);
134  CHECK(escape_char_expr->get_type_info().is_string());
135  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
136  escape_char = (*escape_char_expr->get_constval().stringval)[0];
137  }
138  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_like_expr());
139  CHECK(pattern);
140  auto fast_dict_like_lv = codegenDictLike(expr->get_own_arg(),
141  pattern,
142  expr->get_is_ilike(),
143  expr->get_is_simple(),
144  escape_char,
145  co);
146  if (fast_dict_like_lv) {
147  return fast_dict_like_lv;
148  }
149  const auto& ti = expr->get_arg()->get_type_info();
150  CHECK(ti.is_string());
151  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
152  throw WatchdogException(
153  "Cannot do LIKE / ILIKE on this dictionary encoded column, its cardinality is "
154  "too high");
155  }
156  auto str_lv = codegen(expr->get_arg(), true, co);
157  if (str_lv.size() != 3) {
158  CHECK_EQ(size_t(1), str_lv.size());
159  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
160  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
161  if (co.device_type_ == ExecutorDeviceType::GPU) {
162  throw QueryMustRunOnCpu();
163  }
164  }
165  auto like_expr_arg_lvs = codegen(expr->get_like_expr(), true, co);
166  CHECK_EQ(size_t(3), like_expr_arg_lvs.size());
167  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
168  std::vector<llvm::Value*> str_like_args{
169  str_lv[1], str_lv[2], like_expr_arg_lvs[1], like_expr_arg_lvs[2]};
170  std::string fn_name{expr->get_is_ilike() ? "string_ilike" : "string_like"};
171  if (expr->get_is_simple()) {
172  fn_name += "_simple";
173  } else {
174  str_like_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
175  }
176  if (is_nullable) {
177  fn_name += "_nullable";
178  str_like_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
179  }
180  return cgen_state_->emitCall(fn_name, str_like_args);
181 }
182 
184  const std::shared_ptr<Analyzer::Expr> like_arg,
185  const Analyzer::Constant* pattern,
186  const bool ilike,
187  const bool is_simple,
188  const char escape_char,
189  const CompilationOptions& co) {
190  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(like_arg);
191  if (!cast_oper) {
192  return nullptr;
193  }
194  CHECK(cast_oper);
195  CHECK_EQ(kCAST, cast_oper->get_optype());
196  const auto dict_like_arg = cast_oper->get_own_operand();
197  const auto& dict_like_arg_ti = dict_like_arg->get_type_info();
198  if (!dict_like_arg_ti.is_string()) {
199  throw(std::runtime_error("Cast from " + dict_like_arg_ti.get_type_name() + " to " +
200  cast_oper->get_type_info().get_type_name() +
201  " not supported"));
202  }
203  CHECK_EQ(kENCODING_DICT, dict_like_arg_ti.get_compression());
204  const auto sdp = executor()->getStringDictionaryProxy(
205  dict_like_arg_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
206  if (sdp->storageEntryCount() > 200000000) {
207  return nullptr;
208  }
209  const auto& pattern_ti = pattern->get_type_info();
210  CHECK(pattern_ti.is_string());
211  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
212  const auto& pattern_datum = pattern->get_constval();
213  const auto& pattern_str = *pattern_datum.stringval;
214  const auto matching_ids = sdp->getLike(pattern_str, ilike, is_simple, escape_char);
215  // InIntegerSet requires 64-bit values
216  std::vector<int64_t> matching_ids_64(matching_ids.size());
217  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
218  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
219  dict_like_arg, matching_ids_64, dict_like_arg_ti.get_notnull());
220  return codegen(in_values.get(), co);
221 }
222 
223 namespace {
224 
225 std::vector<int32_t> get_compared_ids(const StringDictionaryProxy* dict,
226  const SQLOps compare_operator,
227  const std::string& pattern) {
228  std::vector<int> ret;
229  switch (compare_operator) {
230  case kLT:
231  ret = dict->getCompare(pattern, "<");
232  break;
233  case kLE:
234  ret = dict->getCompare(pattern, "<=");
235  break;
236  case kEQ:
237  case kBW_EQ:
238  ret = dict->getCompare(pattern, "=");
239  break;
240  case kGT:
241  ret = dict->getCompare(pattern, ">");
242  break;
243  case kGE:
244  ret = dict->getCompare(pattern, ">=");
245  break;
246  case kNE:
247  ret = dict->getCompare(pattern, "<>");
248  break;
249  default:
250  std::runtime_error("unsuported operator for string comparision");
251  }
252  return ret;
253 }
254 } // namespace
255 
256 llvm::Value* CodeGenerator::codegenDictStrCmp(const std::shared_ptr<Analyzer::Expr> lhs,
257  const std::shared_ptr<Analyzer::Expr> rhs,
258  const SQLOps compare_operator,
259  const CompilationOptions& co) {
260  auto rhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(rhs);
261  auto lhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(lhs);
262  auto rhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(rhs);
263  auto lhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(lhs);
264  std::shared_ptr<const Analyzer::UOper> cast_oper;
265  std::shared_ptr<const Analyzer::ColumnVar> col_var;
266  auto compare_opr = compare_operator;
267  if (lhs_col_var && rhs_col_var) {
268  if (lhs_col_var->get_type_info().get_comp_param() ==
269  rhs_col_var->get_type_info().get_comp_param()) {
270  if (compare_operator == kEQ || compare_operator == kNE) {
271  // TODO (vraj): implement compare between two dictionary encoded columns which
272  // share a dictionary
273  return nullptr;
274  }
275  }
276  // TODO (vraj): implement compare between two dictionary encoded columns which don't
277  // shared dictionary
278  throw std::runtime_error("Decoding two Dictionary encoded columns will be slow");
279  } else if (lhs_col_var && rhs_cast_oper) {
280  cast_oper.swap(rhs_cast_oper);
281  col_var.swap(lhs_col_var);
282  } else if (lhs_cast_oper && rhs_col_var) {
283  cast_oper.swap(lhs_cast_oper);
284  col_var.swap(rhs_col_var);
285  switch (compare_operator) {
286  case kLT:
287  compare_opr = kGT;
288  break;
289  case kLE:
290  compare_opr = kGE;
291  break;
292  case kGT:
293  compare_opr = kLT;
294  break;
295  case kGE:
296  compare_opr = kLE;
297  default:
298  break;
299  }
300  }
301  if (!cast_oper || !col_var) {
302  return nullptr;
303  }
304  CHECK_EQ(kCAST, cast_oper->get_optype());
305 
306  const auto const_expr =
307  dynamic_cast<Analyzer::Constant*>(cast_oper->get_own_operand().get());
308  if (!const_expr) {
309  // Analyzer casts dictionary encoded columns to none encoded if there is a comparison
310  // between two encoded columns. Which we currently do not handle.
311  return nullptr;
312  }
313  const auto& const_val = const_expr->get_constval();
314 
315  const auto col_ti = col_var->get_type_info();
316  CHECK(col_ti.is_string());
317  CHECK_EQ(kENCODING_DICT, col_ti.get_compression());
318  const auto sdp = executor()->getStringDictionaryProxy(
319  col_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
320 
321  if (sdp->storageEntryCount() > 200000000) {
322  std::runtime_error("Cardinality for string dictionary is too high");
323  return nullptr;
324  }
325 
326  const auto& pattern_str = *const_val.stringval;
327  const auto matching_ids = get_compared_ids(sdp, compare_opr, pattern_str);
328 
329  // InIntegerSet requires 64-bit values
330  std::vector<int64_t> matching_ids_64(matching_ids.size());
331  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
332 
333  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
334  col_var, matching_ids_64, col_ti.get_notnull());
335  return codegen(in_values.get(), co);
336 }
337 
339  const CompilationOptions& co) {
340  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
341  throw std::runtime_error("REGEXP not supported for unnested expressions");
342  }
343  char escape_char{'\\'};
344  if (expr->get_escape_expr()) {
345  auto escape_char_expr =
346  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
347  CHECK(escape_char_expr);
348  CHECK(escape_char_expr->get_type_info().is_string());
349  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
350  escape_char = (*escape_char_expr->get_constval().stringval)[0];
351  }
352  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_pattern_expr());
353  CHECK(pattern);
354  auto fast_dict_pattern_lv =
355  codegenDictRegexp(expr->get_own_arg(), pattern, escape_char, co);
356  if (fast_dict_pattern_lv) {
357  return fast_dict_pattern_lv;
358  }
359  const auto& ti = expr->get_arg()->get_type_info();
360  CHECK(ti.is_string());
361  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
362  throw WatchdogException(
363  "Cannot do REGEXP_LIKE on this dictionary encoded column, its cardinality is too "
364  "high");
365  }
366  // Now we know we are working on NONE ENCODED column. So switch back to CPU
368  throw QueryMustRunOnCpu();
369  }
370  auto str_lv = codegen(expr->get_arg(), true, co);
371  if (str_lv.size() != 3) {
372  CHECK_EQ(size_t(1), str_lv.size());
373  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
374  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
375  }
376  auto regexp_expr_arg_lvs = codegen(expr->get_pattern_expr(), true, co);
377  CHECK_EQ(size_t(3), regexp_expr_arg_lvs.size());
378  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
379  std::vector<llvm::Value*> regexp_args{
380  str_lv[1], str_lv[2], regexp_expr_arg_lvs[1], regexp_expr_arg_lvs[2]};
381  std::string fn_name("regexp_like");
382  regexp_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
383  if (is_nullable) {
384  fn_name += "_nullable";
385  regexp_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
386  return cgen_state_->emitExternalCall(
387  fn_name, get_int_type(8, cgen_state_->context_), regexp_args);
388  }
389  return cgen_state_->emitExternalCall(
390  fn_name, get_int_type(1, cgen_state_->context_), regexp_args);
391 }
392 
394  const std::shared_ptr<Analyzer::Expr> pattern_arg,
395  const Analyzer::Constant* pattern,
396  const char escape_char,
397  const CompilationOptions& co) {
398  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(pattern_arg);
399  if (!cast_oper) {
400  return nullptr;
401  }
402  CHECK(cast_oper);
403  CHECK_EQ(kCAST, cast_oper->get_optype());
404  const auto dict_regexp_arg = cast_oper->get_own_operand();
405  const auto& dict_regexp_arg_ti = dict_regexp_arg->get_type_info();
406  CHECK(dict_regexp_arg_ti.is_string());
407  CHECK_EQ(kENCODING_DICT, dict_regexp_arg_ti.get_compression());
408  const auto comp_param = dict_regexp_arg_ti.get_comp_param();
409  const auto sdp = executor()->getStringDictionaryProxy(
410  comp_param, executor()->getRowSetMemoryOwner(), true);
411  if (sdp->storageEntryCount() > 15000000) {
412  return nullptr;
413  }
414  const auto& pattern_ti = pattern->get_type_info();
415  CHECK(pattern_ti.is_string());
416  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
417  const auto& pattern_datum = pattern->get_constval();
418  const auto& pattern_str = *pattern_datum.stringval;
419  const auto matching_ids = sdp->getRegexpLike(pattern_str, escape_char);
420  // InIntegerSet requires 64-bit values
421  std::vector<int64_t> matching_ids_64(matching_ids.size());
422  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
423  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
424  dict_regexp_arg, matching_ids_64, dict_regexp_arg_ti.get_notnull());
425  return codegen(in_values.get(), co);
426 }
std::pair< char *, size_t > getStringBytes(int32_t string_id) const noexcept
#define CHECK_EQ(x, y)
Definition: Logger.h:198
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:927
int8_t * extract_str_ptr_noinline(const uint64_t str_and_len)
std::vector< int32_t > get_compared_ids(const StringDictionaryProxy *dict, const SQLOps compare_operator, const std::string &pattern)
const Expr * get_escape_expr() const
Definition: Analyzer.h:857
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:335
CgenState * cgen_state_
bool is_null
Definition: sqltypes.h:76
const Expr * get_escape_expr() const
Definition: Analyzer.h:929
SQLOps
Definition: sqldefs.h:29
Definition: sqldefs.h:35
Definition: sqldefs.h:36
Definition: sqldefs.h:49
Definition: sqldefs.h:30
const Expr * get_arg() const
Definition: Analyzer.h:926
const Analyzer::Expr * extract_cast_arg(const Analyzer::Expr *expr)
Definition: Execute.h:152
std::string getString(int32_t string_id) const
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:181
const Expr * get_arg() const
Definition: Analyzer.h:854
int8_t * pointer
Definition: sqltypes.h:75
bool g_enable_watchdog
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
llvm::LLVMContext & context_
Definition: CgenState.h:267
CHECK(cgen_state)
Classes representing a parse tree.
int32_t extract_str_len_noinline(const uint64_t str_and_len)
bool get_is_simple() const
Definition: Analyzer.h:859
llvm::Value * codegenDictStrCmp(const std::shared_ptr< Analyzer::Expr >, const std::shared_ptr< Analyzer::Expr >, const SQLOps, const CompilationOptions &co)
#define NULL_INT
Definition: sqltypes.h:176
llvm::Value * codegenDictRegexp(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const char escape_char, const CompilationOptions &)
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:78
ExecutorDeviceType device_type_
llvm::Value * emitCall(const std::string &fname, const std::vector< llvm::Value * > &args)
Definition: CgenState.cpp:134
std::string * stringval
Definition: sqltypes.h:134
Definition: sqldefs.h:34
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:25
const Expr * get_pattern_expr() const
Definition: Analyzer.h:928
llvm::Value * emitExternalCall(const std::string &fname, llvm::Type *ret_type, const std::vector< llvm::Value * > args, const std::vector< llvm::Attribute::AttrKind > &fnattrs={})
Definition: CgenState.h:205
uint64_t string_decompress(const int32_t string_id, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:36
const Expr * get_like_expr() const
Definition: Analyzer.h:856
int32_t lower_encoded(int32_t string_id, int64_t string_dict_proxy_address)
Definition: StringOpsIR.cpp:58
Datum get_constval() const
Definition: Analyzer.h:329
Definition: sqldefs.h:32
const Expr * get_arg() const
Definition: Analyzer.h:650
Expression class for the LOWER (lowercase) string function. The "arg" constructor parameter must be a...
Definition: Analyzer.h:740
const Expr * get_arg() const
Definition: Analyzer.h:699
llvm::ConstantInt * llInt(const T v) const
Definition: CgenState.h:248
Definition: sqldefs.h:31
int32_t getOrAddTransient(const std::string &str)
Definition: sqldefs.h:33
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:855
bool is_unnest(const Analyzer::Expr *expr)
Definition: Execute.h:1090
int32_t string_compress(const int64_t ptr_and_len, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:49
__device__ uint64_t string_decode(int8_t *chunk_iter_, int64_t pos)
int32_t getIdOfString(const std::string &str) const
const Expr * get_arg() const
Definition: Analyzer.h:744
bool get_is_ilike() const
Definition: Analyzer.h:858
llvm::Value * codegenDictLike(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const bool ilike, const bool is_simple, const char escape_char, const CompilationOptions &)
size_t length
Definition: sqltypes.h:74
Executor * executor() const