OmniSciDB  c0231cc57d
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringDictionaryProxy.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef STRINGDICTIONARY_STRINGDICTIONARYPROXY_H
18 #define STRINGDICTIONARY_STRINGDICTIONARYPROXY_H
19 
20 #include "Logger/Logger.h" // For CHECK macros
21 #include "StringDictionary.h"
22 
23 #include "ThirdParty/robin_hood/robin_hood.h"
24 
25 #include <optional>
26 #include <ostream>
27 #include <shared_mutex>
28 #include <string>
29 #include <string_view>
30 #include <tuple>
31 #include <vector>
32 
// Forward declaration only: in this header StringOpInfo appears solely as the
// element type of vectors passed by const reference, so its definition is not
// required here.
33 namespace StringOps_Namespace {
34 struct StringOpInfo;
35 }
36 
37 // used to access a StringDictionary when transient strings are involved
39  public:
// Constructs a proxy from a shared StringDictionary, that dictionary's id and a
// generation value. The definition is out of line; presumably the arguments are
// stored in string_dict_, string_dict_id_ and generation_ -- TODO confirm.
42  StringDictionaryProxy(std::shared_ptr<StringDictionary> sd,
43  const int32_t string_dict_id,
44  const int64_t generation);
45 
46  int32_t getDictId() const noexcept { return string_dict_id_; };
47 
// Equality / inequality between two proxies. The comparison criteria live in the
// out-of-line definitions and are not visible in this header.
48  bool operator==(StringDictionaryProxy const&) const;
49  bool operator!=(StringDictionaryProxy const&) const;
50 
// Returns the int32_t id for `str`. Non-const; by its name it presumably inserts
// the string when it is absent -- confirm against the definition.
51  int32_t getOrAdd(const std::string& str) noexcept;
// Raw pointer to the underlying dictionary; presumably string_dict_.get() -- TODO
// confirm.
52  StringDictionary* getDictionary() const noexcept;
// Returns the proxy's generation; presumably the generation_ member -- TODO confirm.
53  int64_t getGeneration() const noexcept;
54 
// Executes a read-only lookup of a vector of strings and returns a vector of their
// integer ids.
75  std::vector<int32_t> getTransientBulk(const std::vector<std::string>& strings) const;
// Non-const single-string variant; by its name it presumably registers `str` as a
// transient when no id exists yet -- confirm against the definition.
76  int32_t getOrAddTransient(const std::string& str);
77  // Not currently used
78  std::vector<int32_t> getOrAddTransientBulk(const std::vector<std::string>& strings);
// Const lookup of the id for `str`; does not modify the proxy object.
// NOTE(review): the value returned for an unknown string is not visible here
// (presumably StringDictionary::INVALID_STR_ID) -- confirm.
79  int32_t getIdOfString(const std::string& str) const;
81  const std::string& str) const; // disregard generation, only used by QueryRenderer
// Returns, by value, the string associated with `string_id`.
82  std::string getString(int32_t string_id) const;
// Bulk form: takes a vector of ids and returns a vector of strings by value.
83  std::vector<std::string> getStrings(const std::vector<int32_t>& string_ids) const;
// Returns a (pointer, length) pair for the string's bytes without copying them.
// NOTE(review): the lifetime of the returned pointer is not stated in this header
// -- confirm before holding on to it.
84  std::pair<const char*, size_t> getStringBytes(int32_t string_id) const noexcept;
85 
86  class IdMap {
87  size_t const offset_;
88  std::vector<int32_t> vector_map_;
89  int64_t num_untranslated_strings_{-1};
90  int32_t range_start_{0};
91  int32_t range_end_{0};
92 
93  public:
94  // +1 is added to skip string_id=-1 reserved for INVALID_STR_ID. id_map[-1]==-1.
95  IdMap(uint32_t const tran_size, uint32_t const dict_size)
96  : offset_(tran_size + 1)
97  , vector_map_(offset_ + dict_size, StringDictionary::INVALID_STR_ID) {}
98  IdMap(IdMap const&) = delete;
99  IdMap(IdMap&&) = default;
100  bool empty() const { return vector_map_.size() == 1; }
101  inline size_t getIndex(int32_t const id) const { return offset_ + id; }
102  std::vector<int32_t> const& getVectorMap() const { return vector_map_; }
103  size_t size() const { return vector_map_.size(); }
104  size_t numTransients() const { return offset_ - 1; }
105  size_t numNonTransients() const { return vector_map_.size() - offset_; }
106  int32_t* data() { return vector_map_.data(); }
107  int32_t const* data() const { return vector_map_.data(); }
108  int32_t domainStart() const { return -static_cast<int32_t>(offset_); }
109  int32_t domainEnd() const { return static_cast<int32_t>(numNonTransients()); }
110  void setRangeStart(const int32_t range_start) { range_start_ = range_start; }
111  void setRangeEnd(const int32_t range_end) { range_end_ = range_end; }
112  int32_t rangeStart() const { return range_start_; }
113  int32_t rangeEnd() const { return range_end_; }
114 
115  // Next two methods are currently used by buildUnionTranslationMapToOtherProxy to
116  // short circuit iteration over ids after intersection translation if all
117  // ids translated. Currently the private num_untranslated_strings_ is initialized
118  // to a -1 sentinel to signify that the value has not been calculated, which we
119  // CHECK against in the getter numUntranslatedStrings() method
120  // to represent that the num_untranslated_strings_ field has been uninitialized
121  size_t numUntranslatedStrings() const {
122  CHECK_GE(num_untranslated_strings_, 0L);
123  return static_cast<size_t>(num_untranslated_strings_);
124  }
125  void setNumUntranslatedStrings(const size_t num_untranslated_strings) {
126  num_untranslated_strings_ = static_cast<int64_t>(num_untranslated_strings);
127  }
128  int32_t* storageData() { return vector_map_.data() + offset_; }
129  int32_t& operator[](int32_t const id) { return vector_map_[getIndex(id)]; }
130  int32_t operator[](int32_t const id) const { return vector_map_[getIndex(id)]; }
131  friend std::ostream& operator<<(std::ostream&, IdMap const&);
132  };
133 
135 
156  const StringDictionaryProxy* dest_proxy,
157  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const;
158 
160  StringDictionaryProxy* dest_proxy,
161  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_types) const;
162 
// Returns the number of string entries in the underlying string dictionary, as of
// this proxy's generation_.
172  size_t storageEntryCount() const;
173 
// Returns the number of transient string entries for this proxy.
180  size_t transientEntryCount() const;
181 
// Returns the total number of string entries for this proxy, including those
// stored in the underlying dictionary (and presumably the transient entries as
// well -- confirm against the definition).
190  size_t entryCount() const;
191 
// Supplies a new generation value; presumably overwrites generation_ -- TODO
// confirm.
192  void updateGeneration(const int64_t generation) noexcept;
193 
// Pattern-based queries. Each returns a vector of int32_t values, presumably the
// ids of the strings that satisfy the predicate -- the matching semantics are in
// the out-of-line definitions and are not visible in this header.
//
// getLike: takes the pattern, a case-insensitivity flag (`icase`), an `is_simple`
// flag and the escape character.
194  std::vector<int32_t> getLike(const std::string& pattern,
195  const bool icase,
196  const bool is_simple,
197  const char escape) const;
198 
// getCompare: takes the pattern and the comparison operator spelled as a string.
// NOTE(review): the set of accepted operator strings is not shown here.
199  std::vector<int32_t> getCompare(const std::string& pattern,
200  const std::string& comp_operator) const;
201 
// getRegexpLike: takes a regular-expression pattern and the escape character.
202  std::vector<int32_t> getRegexpLike(const std::string& pattern, const char escape) const;
203 
205  using is_transparent = void; // Used by robin_hood to activate heterogeneous hashing
206  // std::string and char const* are implicitly cast to std::string_view.
// Hashes the key's bytes (data(), size()) with robin_hood::hash_bytes.
207  size_t operator()(std::string_view const key) const {
208  return robin_hood::hash_bytes(key.data(), key.size());
209  }
210  };
212  using is_transparent = void; // Used by robin_hood to activate heterogeneous equal
213  // std::string and char const* are implicitly cast to std::string_view.
// Compares the two views with std::string_view's operator==.
214  bool operator()(std::string_view const lhs, std::string_view const rhs) const {
215  return lhs == rhs;
216  }
217  };
218 
219  // The std::string must live in the map, and std::string const* in the vector. As
220  // desirable as it might be to have it the other way, string addresses won't change
221  // in the robin_hood::unordered_node_map when new strings are added, but may change
222  // in a std::vector (and robin_hood::unordered_flat_map).
223  using TransientMap = robin_hood::unordered_node_map<std::string,
224  int32_t,
225  HeterogeneousStringHash,
227 
// Read-only access to transient_string_vec_, the vector of pointers to the
// transient strings. Returns a reference to the member, not a copy.
228  const std::vector<std::string const*>& getTransientVector() const {
229  return transient_string_vec_;
230  }
231 
// Maps a transient string_id to its zero-based index. string_id -1 is reserved
// for INVALID_STR_ID, so transient ids start at -2 and count downwards:
// -2 -> 0, -3 -> 1, -4 -> 2, ...
static unsigned transientIdToIndex(int32_t const id) {
  constexpr int kGreatestTransientId = -2;
  int const distance_below_greatest = kGreatestTransientId - id;
  return static_cast<unsigned>(distance_below_greatest);
}
238 
// Maps a zero-based transient index to its string_id, counting downwards from -2:
// 0 -> -2, 1 -> -3, 2 -> -4, ...
static int32_t transientIndexToId(unsigned const index) {
  constexpr int kGreatestTransientId = -2;
  // int - unsigned is evaluated in unsigned arithmetic; the cast below converts
  // the wrapped result to the signed id.
  unsigned const wrapped = kGreatestTransientId - index;
  return static_cast<int32_t>(wrapped);
}
243 
244  // Iterate over transient strings, then non-transients.
246 
247  // Union strings from both StringDictionaryProxies into *this as transients.
248  // Return map of old string_ids to new string_ids.
250 
251  private:
// Private variants carrying the "Unlocked" suffix. Presumably they expect the
// caller to already hold the proxy's lock -- confirm against the definitions.
252  std::string getStringUnlocked(const int32_t string_id) const;
253  size_t transientEntryCountUnlocked() const;
254  size_t entryCountUnlocked() const;
// NOTE(review): the name does not say what is counted; presumably the number of
// persisted (non-transient) dictionary entries -- confirm.
255  size_t persistedC() const;
// Looks up a single string and returns an int32_t id. Templated on the string
// type passed in.
256  template <typename String>
257  int32_t lookupTransientStringUnlocked(const String& lookup_string) const;
// Bulk lookup helpers. Each takes the input strings plus a caller-provided
// `string_ids` buffer, presumably filled with one id per input string -- confirm.
// NOTE(review): the meaning of the size_t return value is not visible in this
// header.
// `take_read_lock` presumably selects whether the callee acquires the read lock
// itself -- confirm.
258  size_t getTransientBulkImpl(const std::vector<std::string>& strings,
259  int32_t* string_ids,
260  const bool take_read_lock) const;
261  template <typename String>
262  size_t transientLookupBulk(const std::vector<String>& lookup_strings,
263  int32_t* string_ids,
264  const bool take_read_lock) const;
265  template <typename String>
266  size_t transientLookupBulkUnlocked(const std::vector<String>& lookup_strings,
267  int32_t* string_ids) const;
// Same signature as transientLookupBulkUnlocked; by its name presumably a
// multi-threaded implementation -- confirm.
268  template <typename String>
269  size_t transientLookupBulkParallelUnlocked(const std::vector<String>& lookup_strings,
270  int32_t* string_ids) const;
271 
273  const StringDictionaryProxy* dest_proxy,
274  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const;
275 
// Shared ownership of the underlying StringDictionary.
276  std::shared_ptr<StringDictionary> string_dict_;
// Dictionary id returned by getDictId(); const, so fixed at construction.
277  const int32_t string_dict_id_;
279  // Holds pointers into transient_str_to_int_
280  std::vector<std::string const*> transient_string_vec_;
// Generation value of this proxy; presumably read by getGeneration() and replaced
// by updateGeneration() -- TODO confirm.
281  int64_t generation_;
283 
284  // Return INVALID_STR_ID if not found on string_dict_. Don't lock or check transients.
285  template <typename String>
286  int32_t getIdOfStringFromClient(String const&) const;
// Non-const, templated on the string type, returns an int32_t id. By its name,
// presumably the get-or-add of a transient for callers that already hold the lock
// -- confirm against the definition.
287  template <typename String>
288  int32_t getOrAddTransientUnlocked(String const&);
289 
// Friend declarations: both callback classes (defined elsewhere) may access this
// class's private members.
290  friend class StringLocalCallback;
291  friend class StringNetworkCallback;
292 };
293 #endif // STRINGDICTIONARY_STRINGDICTIONARYPROXY_H
void eachStringSerially(StringDictionary::StringCallback &) const
std::pair< const char *, size_t > getStringBytes(int32_t string_id) const noexcept
std::vector< int32_t > getLike(const std::string &pattern, const bool icase, const bool is_simple, const char escape) const
size_t transientEntryCountUnlocked() const
const std::vector< std::string const * > & getTransientVector() const
size_t entryCount() const
Returns the number of total string entries for this proxy, both stored in the underlying dictionary a...
int32_t getIdOfStringNoGeneration(const std::string &str) const
std::ostream & operator<<(std::ostream &os, const SessionInfo &session_info)
Definition: SessionInfo.cpp:57
std::vector< int32_t > vector_map_
std::string getStringUnlocked(const int32_t string_id) const
size_t storageEntryCount() const
Returns the number of string entries in the underlying string dictionary, at this proxy's generation_...
StringDictionary * getDictionary() const noexcept
#define CHECK_GE(x, y)
Definition: Logger.h:235
size_t transientLookupBulkUnlocked(const std::vector< String > &lookup_strings, int32_t *string_ids) const
StringDictionaryProxy const & operator=(StringDictionaryProxy const &)=delete
size_t transientLookupBulk(const std::vector< String > &lookup_strings, int32_t *string_ids, const bool take_read_lock) const
std::string getString(int32_t string_id) const
void setNumUntranslatedStrings(const size_t num_untranslated_strings)
IdMap buildIntersectionTranslationMapToOtherProxyUnlocked(const StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
size_t transientLookupBulkParallelUnlocked(const std::vector< String > &lookup_strings, int32_t *string_ids) const
int32_t getIdOfStringFromClient(String const &) const
std::vector< int32_t > getTransientBulk(const std::vector< std::string > &strings) const
Executes read-only lookup of a vector of strings and returns a vector of their integer ids...
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
std::shared_ptr< StringDictionary > string_dict_
IdMap transientUnion(StringDictionaryProxy const &)
std::vector< std::string const * > transient_string_vec_
int32_t lookupTransientStringUnlocked(const String &lookup_string) const
void setRangeEnd(const int32_t range_end)
std::vector< std::string > getStrings(const std::vector< int32_t > &string_ids) const
size_t getTransientBulkImpl(const std::vector< std::string > &strings, int32_t *string_ids, const bool take_read_lock) const
size_t operator()(std::string_view const key) const
static int32_t transientIndexToId(unsigned const index)
void updateGeneration(const int64_t generation) noexcept
size_t transientEntryCount() const
Returns the number of transient string entries for this proxy.
IdMap buildUnionTranslationMapToOtherProxy(StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_types) const
StringDictionaryProxy(StringDictionaryProxy const &)=delete
int32_t const * data() const
std::vector< int32_t > const & getVectorMap() const
int32_t getOrAddTransientUnlocked(String const &)
int32_t operator[](int32_t const id) const
bool operator!=(StringDictionaryProxy const &) const
std::vector< int32_t > getRegexpLike(const std::string &pattern, const char escape) const
int32_t getOrAdd(const std::string &str) noexcept
bool operator==(StringDictionaryProxy const &) const
int32_t getDictId() const noexcept
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)
IdMap buildIntersectionTranslationMapToOtherProxy(const StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
Builds a vectorized string_id translation map from this proxy to dest_proxy.
int32_t getOrAddTransient(const std::string &str)
void setRangeStart(const int32_t range_start)
robin_hood::unordered_node_map< std::string, int32_t, HeterogeneousStringHash, HeterogeneousStringEqual > TransientMap
int32_t & operator[](int32_t const id)
nvtxRangeId_t range_start(const char *)
Definition: nvtx_helpers.h:247
std::shared_timed_mutex shared_mutex
void range_end(nvtxRangeId_t)
Definition: nvtx_helpers.h:253
bool operator()(std::string_view const lhs, std::string_view const rhs) const
size_t persistedC() const
int32_t getIdOfString(const std::string &str) const
static unsigned transientIdToIndex(int32_t const id)
int64_t getGeneration() const noexcept
size_t getIndex(int32_t const id) const
IdMap(uint32_t const tran_size, uint32_t const dict_size)