OmniSciDB  340b00dbf6
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringOpsIR.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
19 
20 #include "../Shared/sqldefs.h"
21 #include "Parser/ParserNode.h"
22 
23 #include <boost/locale/conversion.hpp>
24 
25 extern "C" uint64_t string_decode(int8_t* chunk_iter_, int64_t pos) {
26  auto chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
27  VarlenDatum vd;
28  bool is_end;
29  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
30  CHECK(!is_end);
31  return vd.is_null ? 0
32  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
33  (static_cast<uint64_t>(vd.length) << 48);
34 }
35 
36 extern "C" uint64_t string_decompress(const int32_t string_id,
37  const int64_t string_dict_handle) {
38  if (string_id == NULL_INT) {
39  return 0;
40  }
41  auto string_dict_proxy =
42  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
43  auto string_bytes = string_dict_proxy->getStringBytes(string_id);
44  CHECK(string_bytes.first);
45  return (reinterpret_cast<uint64_t>(string_bytes.first) & 0xffffffffffff) |
46  (static_cast<uint64_t>(string_bytes.second) << 48);
47 }
48 
49 extern "C" int32_t string_compress(const int64_t ptr_and_len,
50  const int64_t string_dict_handle) {
51  std::string raw_str(reinterpret_cast<char*>(extract_str_ptr_noinline(ptr_and_len)),
52  extract_str_len_noinline(ptr_and_len));
53  auto string_dict_proxy =
54  reinterpret_cast<const StringDictionaryProxy*>(string_dict_handle);
55  return string_dict_proxy->getIdOfString(raw_str);
56 }
57 
58 extern "C" int32_t lower_encoded(int32_t string_id, int64_t string_dict_proxy_address) {
59  StringDictionaryProxy* string_dict_proxy =
60  reinterpret_cast<StringDictionaryProxy*>(string_dict_proxy_address);
61  auto str = string_dict_proxy->getString(string_id);
62  return string_dict_proxy->getOrAddTransient(boost::locale::to_lower(str));
63 }
64 
66  const CompilationOptions& co) {
    // Emits the call that computes the length of the string argument of `expr`.
    // NOTE(review): the opening line of this definition is not visible here;
    // the comments below describe only the visible body.
    //
    // The argument is generated with column fetching enabled.
68  auto str_lv = codegen(expr->get_arg(), true, co);
    // A three-element result already carries the pointer ([1]) and length ([2]).
    // Otherwise exactly one value is expected, and pointer/length are derived
    // from it with extract_str_ptr / extract_str_len.
69  if (str_lv.size() != 3) {
70  CHECK_EQ(size_t(1), str_lv.size());
    // With the watchdog enabled this path is rejected outright (the message
    // identifies it as the dictionary-encoded case).
71  if (g_enable_watchdog) {
72  throw WatchdogException(
73  "LENGTH / CHAR_LENGTH on dictionary-encoded strings would be slow");
74  }
75  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
76  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
    // This single-value path is not generated for GPU; request a CPU retry.
77  if (co.device_type == ExecutorDeviceType::GPU) {
78  throw QueryMustRunOnCpu();
79  }
80  }
81  std::vector<llvm::Value*> charlength_args{str_lv[1], str_lv[2]};
    // The runtime function name is assembled from suffixes:
    // "char_length" [+ "_encoded"] [+ "_nullable"].
82  std::string fn_name("char_length");
83  if (expr->get_calc_encoded_length()) {
84  fn_name += "_encoded";
85  }
86  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
    // The nullable variant takes one extra argument: the inline integer null
    // value for the expression's result type.
87  if (is_nullable) {
88  fn_name += "_nullable";
89  charlength_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
90  }
    // The "_encoded" variants are emitted as external calls returning a 32-bit
    // integer; the others go through emitCall.
91  return expr->get_calc_encoded_length()
92  ? cgen_state_->emitExternalCall(
93  fn_name, get_int_type(32, cgen_state_->context_), charlength_args)
94  : cgen_state_->emitCall(fn_name, charlength_args);
95 }
96 
98  const CompilationOptions& co) {
    // NOTE(review): the opening line of this definition is not visible here.
    // Generates the argument of `expr` with column fetching enabled; exactly
    // one value is expected back.
100  auto str_lv = codegen(expr->get_arg(), true, co);
101  CHECK_EQ(size_t(1), str_lv.size());
    // That single value is the only argument of "key_for_string_encoded".
102  return cgen_state_->emitCall("key_for_string_encoded", str_lv);
103 }
104 
106  const CompilationOptions& co) {
    // NOTE(review): the opening line of this definition and the condition
    // guarding the throw below are not visible here; presumably the guard
    // rejects non-CPU execution -- confirm against the full file.
109  throw QueryMustRunOnCpu();
110  }
111 
    // Generate the argument with column fetching enabled; exactly one value
    // is expected.
112  auto str_id_lv = codegen(expr->get_arg(), true, co);
113  CHECK_EQ(size_t(1), str_id_lv.size());
114 
    // Fetch the string dictionary proxy selected by the comp_param of the
    // expression's result type; it must exist.
115  const auto string_dictionary_proxy = executor()->getStringDictionaryProxy(
116  expr->get_type_info().get_comp_param(), executor()->getRowSetMemoryOwner(), true);
117  CHECK(string_dictionary_proxy);
118 
    // Call arguments: the generated value plus the proxy's address embedded as
    // a 64-bit integer constant, matching
    // lower_encoded(string_id, string_dict_proxy_address).
119  std::vector<llvm::Value*> args{
120  str_id_lv[0],
121  cgen_state_->llInt(reinterpret_cast<int64_t>(string_dictionary_proxy))};
122 
    // "lower_encoded" is declared with a 32-bit integer return type.
124  "lower_encoded", get_int_type(32, cgen_state_->context_), args);
125 }
126 
128  const CompilationOptions& co) {
    // Generates code for a LIKE / ILIKE expression.
    // NOTE(review): the opening line of this definition is not visible here;
    // the comments below describe only the visible body.
    //
    // Unnested arguments (looking through a cast) are rejected.
130  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
131  throw std::runtime_error("LIKE not supported for unnested expressions");
132  }
    // The escape character defaults to a backslash; an explicit escape
    // expression must be a string constant of exactly one character.
133  char escape_char{'\\'};
134  if (expr->get_escape_expr()) {
135  auto escape_char_expr =
136  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
137  CHECK(escape_char_expr);
138  CHECK(escape_char_expr->get_type_info().is_string());
139  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
140  escape_char = (*escape_char_expr->get_constval().stringval)[0];
141  }
    // The pattern must be a constant.
142  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_like_expr());
143  CHECK(pattern);
    // Try the dictionary-based path first; a non-null result is returned
    // unchanged.
144  auto fast_dict_like_lv = codegenDictLike(expr->get_own_arg(),
145  pattern,
146  expr->get_is_ilike(),
147  expr->get_is_simple(),
148  escape_char,
149  co);
150  if (fast_dict_like_lv) {
151  return fast_dict_like_lv;
152  }
153  const auto& ti = expr->get_arg()->get_type_info();
154  CHECK(ti.is_string());
    // Past this point the match is evaluated on the string value itself. With
    // the watchdog enabled, that is refused for any argument whose compression
    // is not kENCODING_NONE.
155  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
156  throw WatchdogException(
157  "Cannot do LIKE / ILIKE on this dictionary encoded column, its cardinality is "
158  "too high");
159  }
160  auto str_lv = codegen(expr->get_arg(), true, co);
    // A three-element result already carries pointer ([1]) and length ([2]);
    // a single value is unpacked with extract_str_ptr / extract_str_len, and
    // that path is not generated for GPU.
161  if (str_lv.size() != 3) {
162  CHECK_EQ(size_t(1), str_lv.size());
163  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
164  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
165  if (co.device_type == ExecutorDeviceType::GPU) {
166  throw QueryMustRunOnCpu();
167  }
168  }
    // The pattern must yield three values; [1] and [2] are used as its pointer
    // and length.
169  auto like_expr_arg_lvs = codegen(expr->get_like_expr(), true, co);
170  CHECK_EQ(size_t(3), like_expr_arg_lvs.size());
171  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
172  std::vector<llvm::Value*> str_like_args{
173  str_lv[1], str_lv[2], like_expr_arg_lvs[1], like_expr_arg_lvs[2]};
    // Runtime function name: "string_like" or "string_ilike", then "_simple"
    // (which takes no escape character) or the escape character as an extra
    // 8-bit argument, then "_nullable" with the inline null value appended.
174  std::string fn_name{expr->get_is_ilike() ? "string_ilike" : "string_like"};
175  if (expr->get_is_simple()) {
176  fn_name += "_simple";
177  } else {
178  str_like_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
179  }
180  if (is_nullable) {
181  fn_name += "_nullable";
182  str_like_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
183  }
184  return cgen_state_->emitCall(fn_name, str_like_args);
185 }
186 
    // Dictionary-based path for LIKE / ILIKE: asks the string dictionary which
    // ids match the pattern and generates an integer-set membership test on the
    // encoded operand. Returns nullptr when this path does not apply.
    // NOTE(review): the opening line of this definition is not visible here.
188  const std::shared_ptr<Analyzer::Expr> like_arg,
189  const Analyzer::Constant* pattern,
190  const bool ilike,
191  const bool is_simple,
192  const char escape_char,
193  const CompilationOptions& co) {
    // Only applies when the argument is a unary operator, which is then
    // required to be a cast.
195  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(like_arg);
196  if (!cast_oper) {
197  return nullptr;
198  }
    // Redundant with the null check above; kept as-is.
199  CHECK(cast_oper);
200  CHECK_EQ(kCAST, cast_oper->get_optype());
201  const auto dict_like_arg = cast_oper->get_own_operand();
202  const auto& dict_like_arg_ti = dict_like_arg->get_type_info();
    // A cast from a non-string operand is reported as an unsupported cast.
203  if (!dict_like_arg_ti.is_string()) {
204  throw(std::runtime_error("Cast from " + dict_like_arg_ti.get_type_name() + " to " +
205  cast_oper->get_type_info().get_type_name() +
206  " not supported"));
207  }
208  CHECK_EQ(kENCODING_DICT, dict_like_arg_ti.get_compression());
209  const auto sdp = executor()->getStringDictionaryProxy(
210  dict_like_arg_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
    // Give up on this path for dictionaries with more than 200,000,000 entries.
211  if (sdp->storageEntryCount() > 200000000) {
212  return nullptr;
213  }
    // The pattern must be a string constant with no compression.
214  const auto& pattern_ti = pattern->get_type_info();
215  CHECK(pattern_ti.is_string());
216  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
217  const auto& pattern_datum = pattern->get_constval();
218  const auto& pattern_str = *pattern_datum.stringval;
219  const auto matching_ids = sdp->getLike(pattern_str, ilike, is_simple, escape_char);
220  // InIntegerSet requires 64-bit values
221  std::vector<int64_t> matching_ids_64(matching_ids.size());
222  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
    // Generate the membership test of the encoded operand against the matching ids.
223  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
224  dict_like_arg, matching_ids_64, dict_like_arg_ti.get_notnull());
225  return codegen(in_values.get(), co);
226 }
227 
228 namespace {
229 
230 std::vector<int32_t> get_compared_ids(const StringDictionaryProxy* dict,
231  const SQLOps compare_operator,
232  const std::string& pattern) {
233  std::vector<int> ret;
234  switch (compare_operator) {
235  case kLT:
236  ret = dict->getCompare(pattern, "<");
237  break;
238  case kLE:
239  ret = dict->getCompare(pattern, "<=");
240  break;
241  case kEQ:
242  case kBW_EQ:
243  ret = dict->getCompare(pattern, "=");
244  break;
245  case kGT:
246  ret = dict->getCompare(pattern, ">");
247  break;
248  case kGE:
249  ret = dict->getCompare(pattern, ">=");
250  break;
251  case kNE:
252  ret = dict->getCompare(pattern, "<>");
253  break;
254  default:
255  std::runtime_error("unsuported operator for string comparision");
256  }
257  return ret;
258 }
259 } // namespace
260 
// Dictionary-based path for comparing a dictionary-encoded string column with
// a constant: the dictionary is asked for the ids of all strings that satisfy
// the comparison, and an integer-set membership test on the column is
// generated instead of a per-row string comparison.
//
// Handled shape: one side is a ColumnVar and the other a cast (UOper) whose
// operand is a Constant. Returns nullptr when this path does not apply, so
// the caller can fall back to another strategy. Throws std::runtime_error
// when both sides are column variables, except for kEQ / kNE on columns with
// the same comp_param, which returns nullptr.
261 llvm::Value* CodeGenerator::codegenDictStrCmp(const std::shared_ptr<Analyzer::Expr> lhs,
262  const std::shared_ptr<Analyzer::Expr> rhs,
263  const SQLOps compare_operator,
264  const CompilationOptions& co) {
    // Classify both operands as either a unary operator or a column variable.
266  auto rhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(rhs);
267  auto lhs_cast_oper = std::dynamic_pointer_cast<const Analyzer::UOper>(lhs);
268  auto rhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(rhs);
269  auto lhs_col_var = std::dynamic_pointer_cast<const Analyzer::ColumnVar>(lhs);
270  std::shared_ptr<const Analyzer::UOper> cast_oper;
271  std::shared_ptr<const Analyzer::ColumnVar> col_var;
    // Operator actually used for the lookup; mirrored below when the column
    // is on the right-hand side.
272  auto compare_opr = compare_operator;
273  if (lhs_col_var && rhs_col_var) {
274  if (lhs_col_var->get_type_info().get_comp_param() ==
275  rhs_col_var->get_type_info().get_comp_param()) {
276  if (compare_operator == kEQ || compare_operator == kNE) {
277  // TODO (vraj): implement compare between two dictionary encoded columns which
278  // share a dictionary
279  return nullptr;
280  }
281  }
282  // TODO (vraj): implement compare between two dictionary encoded columns which don't
283  // share a dictionary
284  throw std::runtime_error("Decoding two Dictionary encoded columns will be slow");
285  } else if (lhs_col_var && rhs_cast_oper) {
286  cast_oper.swap(rhs_cast_oper);
287  col_var.swap(lhs_col_var);
288  } else if (lhs_cast_oper && rhs_col_var) {
289  cast_oper.swap(lhs_cast_oper);
290  col_var.swap(rhs_col_var);
    // The column is on the right, so the comparison is rewritten as
    // "column <op'> constant": ordering operators are mirrored, all other
    // operators are left unchanged.
291  switch (compare_operator) {
292  case kLT:
293  compare_opr = kGT;
294  break;
295  case kLE:
296  compare_opr = kGE;
297  break;
298  case kGT:
299  compare_opr = kLT;
300  break;
301  case kGE:
302  compare_opr = kLE;
    // NOTE(review): no break here; control falls through into `default`,
    // which only breaks, so the result is unaffected.
303  default:
304  break;
305  }
306  }
    // Any other operand combination is not handled by this path.
307  if (!cast_oper || !col_var) {
308  return nullptr;
309  }
310  CHECK_EQ(kCAST, cast_oper->get_optype());
311 
312  const auto const_expr =
313  dynamic_cast<Analyzer::Constant*>(cast_oper->get_own_operand().get());
314  if (!const_expr) {
315  // Analyzer casts dictionary encoded columns to none encoded if there is a comparison
316  // between two encoded columns. Which we currently do not handle.
317  return nullptr;
318  }
319  const auto& const_val = const_expr->get_constval();
320 
321  const auto col_ti = col_var->get_type_info();
322  CHECK(col_ti.is_string());
323  CHECK_EQ(kENCODING_DICT, col_ti.get_compression());
324  const auto sdp = executor()->getStringDictionaryProxy(
325  col_ti.get_comp_param(), executor()->getRowSetMemoryOwner(), true);
326 
    // Give up on this path for dictionaries with more than 200,000,000 entries.
327  if (sdp->storageEntryCount() > 200000000) {
    // NOTE(review): this statement constructs a std::runtime_error temporary
    // and discards it; nothing is thrown. The effective behavior is the
    // `return nullptr` below. Confirm whether a throw was intended.
328  std::runtime_error("Cardinality for string dictionary is too high");
329  return nullptr;
330  }
331 
332  const auto& pattern_str = *const_val.stringval;
333  const auto matching_ids = get_compared_ids(sdp, compare_opr, pattern_str);
334 
335  // InIntegerSet requires 64-bit values
336  std::vector<int64_t> matching_ids_64(matching_ids.size());
337  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
338 
    // Generate the membership test of the column against the matching ids.
339  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
340  col_var, matching_ids_64, col_ti.get_notnull());
341  return codegen(in_values.get(), co);
342 }
343 
345  const CompilationOptions& co) {
    // Generates code for a regular-expression match.
    // NOTE(review): the opening line of this definition is not visible here;
    // the comments below describe only the visible body.
    //
    // Unnested arguments (looking through a cast) are rejected.
347  if (is_unnest(extract_cast_arg(expr->get_arg()))) {
348  throw std::runtime_error("REGEXP not supported for unnested expressions");
349  }
    // The escape character defaults to a backslash; an explicit escape
    // expression must be a string constant of exactly one character.
350  char escape_char{'\\'};
351  if (expr->get_escape_expr()) {
352  auto escape_char_expr =
353  dynamic_cast<const Analyzer::Constant*>(expr->get_escape_expr());
354  CHECK(escape_char_expr);
355  CHECK(escape_char_expr->get_type_info().is_string());
356  CHECK_EQ(size_t(1), escape_char_expr->get_constval().stringval->size());
357  escape_char = (*escape_char_expr->get_constval().stringval)[0];
358  }
    // The pattern must be a constant.
359  auto pattern = dynamic_cast<const Analyzer::Constant*>(expr->get_pattern_expr());
360  CHECK(pattern);
    // Try the dictionary-based path first; a non-null result is returned
    // unchanged.
361  auto fast_dict_pattern_lv =
362  codegenDictRegexp(expr->get_own_arg(), pattern, escape_char, co);
363  if (fast_dict_pattern_lv) {
364  return fast_dict_pattern_lv;
365  }
366  const auto& ti = expr->get_arg()->get_type_info();
367  CHECK(ti.is_string());
    // With the watchdog enabled, evaluating on the string value is refused for
    // any argument whose compression is not kENCODING_NONE.
368  if (g_enable_watchdog && ti.get_compression() != kENCODING_NONE) {
369  throw WatchdogException(
370  "Cannot do REGEXP_LIKE on this dictionary encoded column, its cardinality is too "
371  "high");
372  }
373  // Now we know we are working on NONE ENCODED column. So switch back to CPU
    // NOTE(review): the condition guarding this throw is not visible in this
    // view; presumably it tests for GPU execution -- confirm against the
    // full file.
375  throw QueryMustRunOnCpu();
376  }
377  auto str_lv = codegen(expr->get_arg(), true, co);
    // A three-element result already carries pointer ([1]) and length ([2]);
    // a single value is unpacked with extract_str_ptr / extract_str_len.
378  if (str_lv.size() != 3) {
379  CHECK_EQ(size_t(1), str_lv.size());
380  str_lv.push_back(cgen_state_->emitCall("extract_str_ptr", {str_lv.front()}));
381  str_lv.push_back(cgen_state_->emitCall("extract_str_len", {str_lv.front()}));
382  }
    // The pattern must yield three values; [1] and [2] are used as its pointer
    // and length.
383  auto regexp_expr_arg_lvs = codegen(expr->get_pattern_expr(), true, co);
384  CHECK_EQ(size_t(3), regexp_expr_arg_lvs.size());
385  const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()};
386  std::vector<llvm::Value*> regexp_args{
387  str_lv[1], str_lv[2], regexp_expr_arg_lvs[1], regexp_expr_arg_lvs[2]};
388  std::string fn_name("regexp_like");
    // The escape character is always passed, as an 8-bit integer.
389  regexp_args.push_back(cgen_state_->llInt(int8_t(escape_char)));
    // The nullable variant takes the inline null value as an extra argument and
    // is declared with an 8-bit integer return type; the non-nullable variant
    // is declared with a 1-bit integer return type.
390  if (is_nullable) {
391  fn_name += "_nullable";
392  regexp_args.push_back(cgen_state_->inlineIntNull(expr->get_type_info()));
393  return cgen_state_->emitExternalCall(
394  fn_name, get_int_type(8, cgen_state_->context_), regexp_args);
395  }
396  return cgen_state_->emitExternalCall(
397  fn_name, get_int_type(1, cgen_state_->context_), regexp_args);
398 }
399 
    // Dictionary-based path for regular-expression matching: asks the string
    // dictionary which ids match the pattern and generates an integer-set
    // membership test on the encoded operand. Returns nullptr when this path
    // does not apply.
    // NOTE(review): the opening line of this definition is not visible here.
401  const std::shared_ptr<Analyzer::Expr> pattern_arg,
402  const Analyzer::Constant* pattern,
403  const char escape_char,
404  const CompilationOptions& co) {
    // Only applies when the argument is a unary operator, which is then
    // required to be a cast.
406  const auto cast_oper = std::dynamic_pointer_cast<Analyzer::UOper>(pattern_arg);
407  if (!cast_oper) {
408  return nullptr;
409  }
    // Redundant with the null check above; kept as-is.
410  CHECK(cast_oper);
411  CHECK_EQ(kCAST, cast_oper->get_optype());
412  const auto dict_regexp_arg = cast_oper->get_own_operand();
413  const auto& dict_regexp_arg_ti = dict_regexp_arg->get_type_info();
    // The cast operand must be a dictionary-encoded string.
414  CHECK(dict_regexp_arg_ti.is_string());
415  CHECK_EQ(kENCODING_DICT, dict_regexp_arg_ti.get_compression());
416  const auto comp_param = dict_regexp_arg_ti.get_comp_param();
417  const auto sdp = executor()->getStringDictionaryProxy(
418  comp_param, executor()->getRowSetMemoryOwner(), true);
    // Give up on this path for dictionaries with more than 15,000,000 entries.
419  if (sdp->storageEntryCount() > 15000000) {
420  return nullptr;
421  }
    // The pattern must be a string constant with no compression.
422  const auto& pattern_ti = pattern->get_type_info();
423  CHECK(pattern_ti.is_string());
424  CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression());
425  const auto& pattern_datum = pattern->get_constval();
426  const auto& pattern_str = *pattern_datum.stringval;
427  const auto matching_ids = sdp->getRegexpLike(pattern_str, escape_char);
428  // InIntegerSet requires 64-bit values
429  std::vector<int64_t> matching_ids_64(matching_ids.size());
430  std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin());
    // Generate the membership test of the encoded operand against the matching ids.
431  const auto in_values = std::make_shared<Analyzer::InIntegerSet>(
432  dict_regexp_arg, matching_ids_64, dict_regexp_arg_ti.get_notnull());
433  return codegen(in_values.get(), co);
434 }
std::string to_lower(const std::string &str)
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::pair< const char *, size_t > getStringBytes(int32_t string_id) const noexcept
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:978
bool g_enable_watchdog
std::vector< int32_t > get_compared_ids(const StringDictionaryProxy *dict, const SQLOps compare_operator, const std::string &pattern)
const Expr * get_escape_expr() const
Definition: Analyzer.h:908
CgenState * cgen_state_
bool is_null
Definition: sqltypes.h:147
const Expr * get_escape_expr() const
Definition: Analyzer.h:980
llvm::Value * emitExternalCall(const std::string &fname, llvm::Type *ret_type, const std::vector< llvm::Value * > args, const std::vector< llvm::Attribute::AttrKind > &fnattrs={}, const bool has_struct_return=false)
Definition: CgenState.h:222
SQLOps
Definition: sqldefs.h:29
Definition: sqldefs.h:35
Definition: sqldefs.h:36
Definition: sqldefs.h:49
Definition: sqldefs.h:30
const Expr * get_arg() const
Definition: Analyzer.h:977
const Analyzer::Expr * extract_cast_arg(const Analyzer::Expr *expr)
Definition: Execute.h:164
std::string getString(int32_t string_id) const
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:181
const Expr * get_arg() const
Definition: Analyzer.h:905
int8_t * pointer
Definition: sqltypes.h:146
NEVER_INLINE int8_t * extract_str_ptr_noinline(const uint64_t str_and_len)
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
llvm::LLVMContext & context_
Definition: CgenState.h:327
Classes representing a parse tree.
bool get_is_simple() const
Definition: Analyzer.h:910
llvm::Value * codegenDictStrCmp(const std::shared_ptr< Analyzer::Expr >, const std::shared_ptr< Analyzer::Expr >, const SQLOps, const CompilationOptions &co)
#define NULL_INT
Definition: sqltypes.h:254
llvm::Value * codegenDictRegexp(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const char escape_char, const CompilationOptions &)
#define AUTOMATIC_IR_METADATA(CGENSTATE)
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:78
NEVER_INLINE int32_t extract_str_len_noinline(const uint64_t str_and_len)
llvm::Value * emitCall(const std::string &fname, const std::vector< llvm::Value * > &args)
Definition: CgenState.cpp:137
std::string * stringval
Definition: sqltypes.h:214
ExecutorDeviceType device_type
Definition: sqldefs.h:34
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:26
const Expr * get_pattern_expr() const
Definition: Analyzer.h:979
uint64_t string_decompress(const int32_t string_id, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:36
const Expr * get_like_expr() const
Definition: Analyzer.h:907
int32_t lower_encoded(int32_t string_id, int64_t string_dict_proxy_address)
Definition: StringOpsIR.cpp:58
Datum get_constval() const
Definition: Analyzer.h:335
Definition: sqldefs.h:32
const Expr * get_arg() const
Definition: Analyzer.h:656
Expression class for the LOWER (lowercase) string function. The "arg" constructor parameter must be a...
Definition: Analyzer.h:791
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:339
const Expr * get_arg() const
Definition: Analyzer.h:705
llvm::ConstantInt * llInt(const T v) const
Definition: CgenState.h:300
#define CHECK(condition)
Definition: Logger.h:197
Definition: sqldefs.h:31
int32_t getOrAddTransient(const std::string &str)
Definition: sqldefs.h:33
const std::shared_ptr< Analyzer::Expr > get_own_arg() const
Definition: Analyzer.h:906
bool is_unnest(const Analyzer::Expr *expr)
Definition: Execute.h:1046
int32_t string_compress(const int64_t ptr_and_len, const int64_t string_dict_handle)
Definition: StringOpsIR.cpp:49
__device__ uint64_t string_decode(int8_t *chunk_iter_, int64_t pos)
int32_t getIdOfString(const std::string &str) const
const Expr * get_arg() const
Definition: Analyzer.h:795
bool get_is_ilike() const
Definition: Analyzer.h:909
llvm::Value * codegenDictLike(const std::shared_ptr< Analyzer::Expr > arg, const Analyzer::Constant *pattern, const bool ilike, const bool is_simple, const char escape_char, const CompilationOptions &)
size_t length
Definition: sqltypes.h:145
Executor * executor() const