OmniSciDB  8a228a1076
StringDictionary.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef STRINGDICTIONARY_STRINGDICTIONARY_H
18 #define STRINGDICTIONARY_STRINGDICTIONARY_H
19 
20 #include "../Shared/mapd_shared_mutex.h"
21 #include "DictRef.h"
22 #include "DictionaryCache.hpp"
23 #include "LeafHostInfo.h"
24 
25 #include <future>
26 #include <map>
27 #include <string>
28 #include <tuple>
29 #include <vector>
30 
32 
34 
// Exception type derived from std::runtime_error, so handlers that catch
// std::runtime_error or std::exception also catch it.
class DictPayloadUnavailable : public std::runtime_error {
 public:
  // Uses the fixed message "DictPayloadUnavailable".
  DictPayloadUnavailable() : std::runtime_error("DictPayloadUnavailable") {}

  // Uses the caller-supplied message `err`, returned later by what().
  // Marked explicit so that a std::string is never silently converted into an
  // exception object; `throw DictPayloadUnavailable("...")` and other
  // direct-initialization forms keep working unchanged.
  explicit DictPayloadUnavailable(const std::string& err) : std::runtime_error(err) {}
};
41 
43  public:
// Constructor taking a folder path plus the isTemp / recover flags.
// materializeHashes defaults to false and initial_capacity defaults to 256.
// NOTE(review): presumably opens or creates the dictionary's storage under
// `folder` -- confirm against the implementation.
44  StringDictionary(const std::string& folder,
45  const bool isTemp,
46  const bool recover,
47  const bool materializeHashes = false,
48  size_t initial_capacity = 256);
// Constructor taking a LeafHostInfo and a DictRef.
// NOTE(review): presumably builds a handle to a dictionary served by `host`
// (the class holds StringDictionaryClient members) -- confirm.
49  StringDictionary(const LeafHostInfo& host, const DictRef dict_ref);
// Destructor; declared noexcept.
50  ~StringDictionary() noexcept;
51 
// Single-string entry point: returns an int32_t and is declared noexcept.
// NOTE(review): presumably returns the id of `str`, inserting it when absent
// -- confirm against the implementation.
52  int32_t getOrAdd(const std::string& str) noexcept;
// Bulk variants: take a vector of String and write their results through the
// caller-supplied `encoded_vec` pointer of element type T.
// NOTE(review): assumes encoded_vec has room for string_vec.size() elements --
// confirm; nothing in the signature enforces it.
53  template <class T, class String>
54  void getOrAddBulk(const std::vector<String>& string_vec, T* encoded_vec);
55  template <class T, class String>
56  void getOrAddBulkParallel(const std::vector<String>& string_vec, T* encoded_vec);
// Array variant: input is a vector of String vectors; output is written to
// `ids_array_vec`, whose element type is fixed to int32_t.
57  template <class String>
58  void getOrAddBulkArray(const std::vector<std::vector<String>>& string_array_vec,
59  std::vector<std::vector<int32_t>>& ids_array_vec);
// Const lookups. Only getStringBytes is declared noexcept; it returns a
// (char*, size_t) pair.
// NOTE(review): presumably a pointer into dictionary-owned storage plus a
// length, and getIdOfString presumably yields INVALID_STR_ID on a miss --
// confirm both.
60  int32_t getIdOfString(const std::string& str) const;
61  std::string getString(int32_t string_id) const;
62  std::pair<char*, size_t> getStringBytes(int32_t string_id) const noexcept;
63  size_t storageEntryCount() const;
64 
// Pattern queries. Each returns a vector of int32_t and takes a `generation`
// argument.
// NOTE(review): presumably the ids of the matching strings, restricted to
// entries visible at `generation` -- confirm against the implementation.
65  std::vector<int32_t> getLike(const std::string& pattern,
66  const bool icase,
67  const bool is_simple,
68  const char escape,
69  const size_t generation) const;
70 
// Unlike getLike and getRegexpLike, getCompare is not const-qualified.
// `comp_operator` is passed as a string.
71  std::vector<int32_t> getCompare(const std::string& pattern,
72  const std::string& comp_operator,
73  const size_t generation);
74 
75  std::vector<int32_t> getRegexpLike(const std::string& pattern,
76  const char escape,
77  const size_t generation) const;
78 
// Returns a shared pointer to an immutable vector of strings.
// NOTE(review): presumably backed by the mutable strings_cache_ member, so
// repeated calls may share one copy -- confirm.
79  std::shared_ptr<const std::vector<std::string>> copyStrings() const;
80 
// Returns a bool and is declared noexcept.
// NOTE(review): presumably true on success -- confirm the return convention.
81  bool checkpoint() noexcept;
82 
// Static helper: takes ids and a source dictionary (both const) and writes
// into `dest_ids`; `dest_dict` is passed as a mutable pointer.
// NOTE(review): presumably translates each source id into the corresponding
// id in dest_dict -- confirm.
// NOTE(review): `transient_mapping` is taken by value, so the map is copied on
// every call that passes one; a const reference would avoid the copy, but the
// out-of-line definition would have to change in lockstep.
100  static void populate_string_ids(
101  std::vector<int32_t>& dest_ids,
102  StringDictionary* dest_dict,
103  const std::vector<int32_t>& source_ids,
104  const StringDictionary* source_dict,
105  const std::map<int32_t, std::string> transient_mapping = {});
106 
// Same parameter shape for vectors of id vectors; this overload has no
// transient_mapping parameter.
107  static void populate_string_array_ids(
108  std::vector<std::vector<int32_t>>& dest_array_ids,
109  StringDictionary* dest_dict,
110  const std::vector<std::vector<int32_t>>& source_array_ids,
111  const StringDictionary* source_dict);
112 
// Compile-time constants of the class.
// INVALID_STR_ID is -1.
// NOTE(review): presumably the sentinel returned when a string has no id --
// confirm at the call sites.
113  static constexpr int32_t INVALID_STR_ID = -1;
// (1 << 15) - 1 == 32767.
114  static constexpr size_t MAX_STRLEN = (1 << 15) - 1;
// (1U << 31) - 1 == 2147483647, the largest value an int32_t can hold.
115  static constexpr size_t MAX_STRCOUNT = (1U << 31) - 1;
116 
117  private:
// Two bit-fields packed into one 64-bit unit: 48 bits for `off`, 16 for `size`.
// This is the element type of the offset_map_ member.
// NOTE(review): presumably `off` is a byte offset into the payload and `size`
// the string length -- confirm. Do not reorder or resize the fields without
// checking whether the layout is persisted.
118  struct StringIdxEntry {
119  uint64_t off : 48;
120  uint64_t size : 16;
121  };
122 
123  // In compare_cache_value_t, index is the position in the sorted cache.
124  // The diff component records whether the entry that index points to is equal to
125  // the pattern it is cached for. We store diff so that we don't have to compare the
126  // string again when we retrieve the entry from the cache.
128  int32_t index;
129  int32_t diff;
130  };
131 
// Plain aggregate returned by getStringFromStorage(): a character pointer, a
// size, and a `canary` flag.
// NOTE(review): c_str_ptr looks like a non-owning pointer and `canary`
// presumably marks a placeholder entry rather than real string data (the class
// has a CANARY_BUFFER member) -- confirm both.
132  struct PayloadString {
133  char* c_str_ptr;
134  size_t size;
135  bool canary;
136  };
137 
// Takes a mutable reference to a vector of futures, each yielding a vector of
// (uint32_t, unsigned int) pairs.
// NOTE(review): presumably waits on the futures and applies their results --
// confirm.
138  void processDictionaryFutures(
139  std::vector<std::future<std::vector<std::pair<uint32_t, unsigned int>>>>&
140  dictionary_futures);
// Capacity bookkeeping. The two queries are const noexcept; increaseCapacity
// mutates and is noexcept.
141  size_t getNumStringsFromStorage(const size_t storage_slots) const noexcept;
142  bool fillRateIsHigh(const size_t num_strings) const noexcept;
143  void increaseCapacity() noexcept;
// Templated on the string type; takes a storage high-water mark together with
// the input strings, their memory ids and their hashes. Declared noexcept.
144  template <class String>
145  void increaseCapacityFromStorageAndMemory(
146  const size_t storage_high_water_mark,
147  const std::vector<String>& input_strings,
148  const std::vector<size_t>& string_memory_ids,
149  const std::vector<uint32_t>& input_strings_rk_hashes) noexcept;
// NOTE(review): presumably the implementation behind the public getOrAdd --
// confirm.
150  int32_t getOrAddImpl(const std::string& str) noexcept;
// Writes into the caller-supplied `hashes` vector; const and noexcept.
151  template <class String>
152  void hashStrings(const std::vector<String>& string_vec,
153  std::vector<uint32_t>& hashes) const noexcept;
// Same parameter shape as the public getOrAddBulk.
// NOTE(review): presumably the path used when the dictionary is remote --
// confirm.
154  template <class T, class String>
155  void getOrAddBulkRemote(const std::vector<String>& string_vec, T* encoded_vec);
// Const, noexcept lookup helpers.
// NOTE(review): the "Unlocked" suffix suggests the caller must already hold
// rw_mutex_, and "Checked" suggests the id is validated first -- confirm both.
156  int32_t getUnlocked(const std::string& str) const noexcept;
157  std::string getStringUnlocked(int32_t string_id) const noexcept;
158  std::string getStringChecked(const int string_id) const noexcept;
159  std::pair<char*, size_t> getStringBytesChecked(const int string_id) const noexcept;
// Bucket computation over a caller-supplied table of int32_t (`data`).
// Const and noexcept; returns a uint32_t.
160  template <class String>
161  uint32_t computeBucket(const uint32_t hash,
162  const String& str,
163  const std::vector<int32_t>& data) const noexcept;
// Variant that also receives a storage high-water mark plus the in-memory
// input strings and their memory ids.
164  template <class String>
165  uint32_t computeBucketFromStorageAndMemory(
166  const uint32_t input_string_rk_hash,
167  const String& input_string,
168  const std::vector<int32_t>& string_id_hash_table,
169  const size_t storage_high_water_mark,
170  const std::vector<String>& input_strings,
171  const std::vector<size_t>& string_memory_ids) const noexcept;
// Takes only a hash and the table (no string argument). It is not
// const-qualified, unlike the two functions above.
172  uint32_t computeUniqueBucketWithHash(const uint32_t hash,
173  const std::vector<int32_t>& data) noexcept;
// These two are not declared noexcept, unlike addPayloadCapacity and
// addOffsetCapacity.
174  void checkAndConditionallyIncreasePayloadCapacity(const size_t write_length);
175  void checkAndConditionallyIncreaseOffsetCapacity(const size_t write_length);
176 
// Storage append helpers, both noexcept. The single-string form takes its
// argument by value; the bulk form also takes the memory ids and the summed
// length of the new strings.
177  template <class String>
178  void appendToStorage(String str) noexcept;
179  template <class String>
180  void appendToStorageBulk(const std::vector<String>& input_strings,
181  const std::vector<size_t>& string_memory_ids,
182  const size_t sum_new_strings_lengths) noexcept;
// Storage reads by id. The "Fast" variant returns a std::string_view instead
// of a PayloadString.
// NOTE(review): the view presumably refers to dictionary-owned storage and
// must not outlive it -- confirm.
183  PayloadString getStringFromStorage(const int string_id) const noexcept;
184  std::string_view getStringFromStorageFast(const int string_id) const noexcept;
// Capacity growth helpers, all noexcept; min_capacity_requested defaults to 0.
// addStorageCapacity takes a file descriptor and returns a size_t.
// addMemoryCapacity takes an address and updates `mem_size` by reference.
185  void addPayloadCapacity(const size_t min_capacity_requested = 0) noexcept;
186  void addOffsetCapacity(const size_t min_capacity_requested = 0) noexcept;
187  size_t addStorageCapacity(int fd, const size_t min_capacity_requested = 0) noexcept;
188  void* addMemoryCapacity(void* addr,
189  size_t& mem_size,
190  const size_t min_capacity_requested = 0) noexcept;
191  void invalidateInvertedIndex() noexcept;
// Takes its string arguments by value, unlike getCompare, which takes them by
// const reference.
192  std::vector<int32_t> getEquals(std::string pattern,
193  std::string comp_operator,
194  size_t generation);
// Sorted-cache maintenance. The cache vectors hold int32_t.
195  void buildSortedCache();
196  void insertInSortedCache(std::string str, int32_t str_id);
197  void sortCache(std::vector<int32_t>& cache);
198  void mergeSortedCache(std::vector<int32_t>& temp_sorted_cache);
// Returns a raw pointer to a compare_cache_value_t.
// NOTE(review): ownership of the returned pointer is not visible from this
// declaration -- confirm who frees it.
199  compare_cache_value_t* binary_search_cache(const std::string& pattern) const;
200 
// Counters and in-memory tables.
201  size_t str_count_;
202  size_t collisions_;
203  std::vector<int32_t> string_id_hash_table_;
204  std::vector<uint32_t> rk_hashes_;
// Named without the trailing underscore used by the other members.
205  std::vector<int32_t> sorted_cache;
206  bool isTemp_;
207  bool materialize_hashes_;
// File-backed state: a path, two file descriptors, two raw pointers and their
// size/offset bookkeeping.
// NOTE(review): offset_map_ and payload_map_ are presumably memory mappings of
// the files behind offset_fd_ and payload_fd_ -- confirm.
208  std::string offsets_path_;
209  int payload_fd_;
210  int offset_fd_;
211  StringIdxEntry* offset_map_;
212  char* payload_map_;
213  size_t offset_file_size_;
214  size_t payload_file_size_;
215  size_t payload_file_off_;
// Mutable so that const member functions can use the mutex and update the
// caches below.
216  mutable mapd_shared_mutex rw_mutex_;
// like_cache_ is keyed by a (string, bool, bool, char) tuple, the same types
// as getLike's pattern/icase/is_simple/escape parameters.
217  mutable std::map<std::tuple<std::string, bool, bool, char>, std::vector<int32_t>>
218  like_cache_;
// regex_cache_ is keyed by a (string, char) pair, the same types as
// getRegexpLike's pattern/escape parameters.
219  mutable std::map<std::pair<std::string, char>, std::vector<int32_t>> regex_cache_;
220  mutable std::map<std::string, int32_t> equal_cache_;
221  mutable DictionaryCache<std::string, compare_cache_value_t> compare_cache_;
222  mutable std::shared_ptr<std::vector<std::string>> strings_cache_;
// Two owned StringDictionaryClient instances.
// NOTE(review): presumably only set when the LeafHostInfo constructor is used
// -- confirm.
223  std::unique_ptr<StringDictionaryClient> client_;
224  std::unique_ptr<StringDictionaryClient> client_no_timeout_;
225 
// Raw buffer pointer (initialised to nullptr) and its size (initialised to 0).
// NOTE(review): allocation and release are not visible in this header --
// confirm the destructor frees it.
226  char* CANARY_BUFFER{nullptr};
227  size_t canary_buffer_size = 0;
228 };
229 
// Free function mapping an id and a generation to an int32_t.
// NOTE(review): presumably returns `id` unchanged when it belongs to
// `generation` and an invalid-id sentinel otherwise -- confirm.
230 int32_t truncate_to_generation(const int32_t id, const size_t generation);
231 
// Free function that writes into `dest_ids`. The dictionaries are identified
// by DictRef values and a host rather than by StringDictionary pointers.
// NOTE(review): presumably the remote counterpart of
// StringDictionary::populate_string_ids -- confirm.
232 void translate_string_ids(std::vector<int32_t>& dest_ids,
233  const LeafHostInfo& dict_server_host,
234  const DictRef dest_dict_ref,
235  const std::vector<int32_t>& source_ids,
236  const DictRef source_dict_ref,
237  const int32_t dest_generation);
238 
239 #endif // STRINGDICTIONARY_STRINGDICTIONARY_H
uint64_t off
uint64_t size
void translate_string_ids(std::vector< int32_t > &dest_ids, const LeafHostInfo &dict_server_host, const DictRef dest_dict_ref, const std::vector< int32_t > &source_ids, const DictRef source_dict_ref, const int32_t dest_generation)
std::shared_timed_mutex mapd_shared_mutex
DictPayloadUnavailable(const std::string &err)
bool g_enable_stringdict_parallel
int32_t truncate_to_generation(const int32_t id, const size_t generation)