OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ExtensionFunctionsText.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <cstring>
18 #include "Shared/toString.h"
19 #include "heavydbTypes.h"
20 
21 // To-Do: strtok_to_array with default "delimiters" value
22 
23 #ifndef __CUDACC__
/**
 * @brief Splits @p text into tokens separated by any character in @p delimiters.
 *
 * Produces the same tokens as a std::strtok loop: a run of consecutive delimiter
 * characters counts as a single separator, and leading or trailing delimiters never
 * yield empty tokens. Unlike std::strtok, the input buffer is never written to and
 * no hidden global state is used, so the function is safe to call concurrently and
 * on strings the caller still needs.
 *
 * @note Embedded NUL characters are treated as ordinary characters here, whereas a
 *       strtok-based scan would stop at the first one.
 *
 * @param text        String to tokenize; it is not modified.
 * @param delimiters  Set of single-character separators. When empty, a non-empty
 *                    @p text is returned as one single token.
 * @return Tokens in order of appearance; empty when @p text holds no token
 *         (empty input, or input made up only of delimiters).
 */
std::vector<std::string> __strtok_to_array(const std::string& text,
                                           const std::string& delimiters) {
  std::vector<std::string> tokens;

  // Skip leading delimiters to find the first character of the first token.
  std::string::size_type token_begin = text.find_first_not_of(delimiters);
  while (token_begin != std::string::npos) {
    // npos means the token runs to the end of the text; substr() clamps the
    // (then oversized) count to the remaining length.
    const std::string::size_type token_end =
        text.find_first_of(delimiters, token_begin);
    tokens.emplace_back(text.substr(token_begin, token_end - token_begin));
    // Searching from npos yields npos, which ends the loop after the last token.
    token_begin = text.find_first_not_of(delimiters, token_end);
  }

  return tokens;
}
40 
// strtok_to_array (TextEncodingNone overload): splits <text> on any character found
// in <delimiters> and returns the tokens as an Array<TextEncodingDict>.
43  TextEncodingNone& text,
44  TextEncodingNone& delimiters) {
45  /*
46  Rules
47  -----
48  * If either parameter is NULL => a NULL is returned
49  * An empty array is returned if tokenization produces no tokens
50 
51  Note
52  ----
53  <delimiters> argument is optional on Snowflake but HeavyDB doesn't support
54  default values on UDFs at the moment. See:
55  https://github.com/heavyai/heavydb-internal/pull/6651
56 
57  Examples
58  --------
59  > select strtok_to_array('a.b.c', '.');
60  {a, b, c}
61 
62  > select strtok_to_array('user@gmail.com', '.@')
63  {user, gmail, com}
64 
65  > select strtok_to_array('', '.')
66  NULL
67 
68  > select strtok_to_array('a.b.c', '')
69  NULL
70  */
71 
// NULL in, NULL out. NOTE(review): the '' examples above presumably rely on isNull()
// reporting empty strings as NULL, and the `true` argument presumably flags the
// returned array as NULL - confirm both against heavydbTypes.h.
72  if (text.isNull() || delimiters.isNull()) {
73  return Array<TextEncodingDict>(0, true);
74  }
75 
// __strtok_to_array returns by value; binding the temporary to a const reference
// keeps it alive until the end of this scope.
76  const auto& vec = __strtok_to_array(text.getString(), delimiters.getString());
// One output slot per token.
77  Array<TextEncodingDict> out_arr(vec.size());
78  for (size_t i = 0; i < vec.size(); ++i) {
// Register each token under the transient dictionary ids and store the value that
// getOrAddTransient returns for it.
79  out_arr[i] = mgr.getOrAddTransient(TRANSIENT_DICT_DB_ID, TRANSIENT_DICT_ID, vec[i]);
80  }
81  return out_arr;
82 }
83 
// strtok_to_array__1 (TextEncodingDict overload): same contract as the
// TextEncodingNone overload, but <text> arrives as a dictionary-encoded value that
// must first be resolved to its string through the row function manager.
86  TextEncodingDict text,
87  TextEncodingNone& delimiters) {
// NULL in, NULL out. NOTE(review): the `true` argument presumably flags the returned
// array as NULL - confirm against heavydbTypes.h.
88  if (text.isNull() || delimiters.isNull()) {
89  return Array<TextEncodingDict>(0, true);
90  }
91 
// Look up the string behind <text>. The 0 passed to the GET_DICT_* macros is their
// arg_idx parameter - presumably the index of the <text> argument; confirm.
92  std::string str = mgr.getString(GET_DICT_DB_ID(mgr, 0), GET_DICT_ID(mgr, 0), text);
// __strtok_to_array returns by value; the const reference extends the temporary's
// lifetime to the end of this scope.
93  const auto& vec = __strtok_to_array(str, delimiters.getString());
// One output slot per token.
94  Array<TextEncodingDict> out_arr(vec.size());
95  for (size_t i = 0; i < vec.size(); ++i) {
// Register each token under the transient dictionary ids and store the value that
// getOrAddTransient returns for it.
96  out_arr[i] = mgr.getOrAddTransient(TRANSIENT_DICT_DB_ID, TRANSIENT_DICT_ID, vec[i]);
97  }
98  return out_arr;
99 }
100 #endif // #ifndef __CUDACC__
#define EXTENSION_NOINLINE
Definition: heavydbTypes.h:52
std::string getString() const
Definition: heavydbTypes.h:311
#define TRANSIENT_DICT_DB_ID
Definition: DbObjectKeys.h:25
#define TRANSIENT_DICT_ID
Definition: DbObjectKeys.h:24
#define GET_DICT_ID(mgr, arg_idx)
Definition: heavydbTypes.h:128
DEVICE ALWAYS_INLINE bool isNull() const
Definition: heavydbTypes.h:165
#define GET_DICT_DB_ID(mgr, arg_idx)
Definition: heavydbTypes.h:126
int32_t getOrAddTransient(int32_t db_id, int32_t dict_id, std::string str)
DEVICE ALWAYS_INLINE bool isNull() const
Definition: heavydbTypes.h:335
EXTENSION_NOINLINE Array< TextEncodingDict > strtok_to_array(RowFunctionManager &mgr, TextEncodingNone &text, TextEncodingNone &delimiters)
std::string getString(int32_t db_id, int32_t dict_id, int32_t string_id)
EXTENSION_NOINLINE Array< TextEncodingDict > strtok_to_array__1(RowFunctionManager &mgr, TextEncodingDict text, TextEncodingNone &delimiters)
std::vector< std::string > __strtok_to_array(const std::string &text, const std::string &delimiters)