OmniSciDB  eb3a3d0a03
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringDictionary.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef STRINGDICTIONARY_STRINGDICTIONARY_H
18 #define STRINGDICTIONARY_STRINGDICTIONARY_H
19 
20 #include "../Shared/mapd_shared_mutex.h"
21 #include "DictRef.h"
22 #include "DictionaryCache.hpp"
23 #include "LeafHostInfo.h"
24 
25 #include <future>
26 #include <map>
27 #include <string>
28 #include <tuple>
29 #include <vector>
30 
32 
34 
/// Exception derived from std::runtime_error.
///
/// The default constructor uses the fixed message "DictPayloadUnavailable";
/// the second constructor forwards a caller-supplied message unchanged.
class DictPayloadUnavailable : public std::runtime_error {
 public:
  /// Builds the exception with the default message "DictPayloadUnavailable".
  DictPayloadUnavailable() : std::runtime_error("DictPayloadUnavailable") {}

  /// Builds the exception with a custom message.
  /// Marked explicit so a std::string is never silently converted into an
  /// exception object; `throw DictPayloadUnavailable("...")` still compiles.
  explicit DictPayloadUnavailable(const std::string& err) : std::runtime_error(err) {}
};
41 
// Hash value type for dictionary strings: a 32-bit unsigned integer. It is the
// element type of the hashes vector filled by hashStrings().
42 using string_dict_hash_t = uint32_t;
43 
45  public:
// Constructs a dictionary for the given folder. materializeHashes defaults to
// false and initial_capacity defaults to 256.
// NOTE(review): isTemp / recover presumably select temporary storage and
// recovery of previously persisted state — confirm against the implementation.
46  StringDictionary(const std::string& folder,
47  const bool isTemp,
48  const bool recover,
49  const bool materializeHashes = false,
50  size_t initial_capacity = 256);
// Constructs a dictionary from a host description and a dictionary reference.
// NOTE(review): presumably a remote dictionary accessed through client_ — confirm.
51  StringDictionary(const LeafHostInfo& host, const DictRef dict_ref);
// Destructor; declared noexcept.
52  ~StringDictionary() noexcept;
53 
// Returns an int32_t id for str; declared noexcept.
// NOTE(review): going by the name, presumably inserts str when it is not yet
// present — confirm against the implementation.
54  int32_t getOrAdd(const std::string& str) noexcept;
// Bulk variant: ids are written through the caller-provided encoded_vec pointer.
// NOTE(review): presumably encoded_vec must hold at least string_vec.size()
// elements of T — confirm; the declaration does not state the required length.
55  template <class T, class String>
56  void getOrAddBulk(const std::vector<String>& string_vec, T* encoded_vec);
// Same signature as getOrAddBulk.
// NOTE(review): presumably a multi-threaded implementation — confirm.
57  template <class T, class String>
58  void getOrAddBulkParallel(const std::vector<String>& string_vec, T* encoded_vec);
// Array variant: takes a vector of string vectors and fills ids_array_vec
// (passed by non-const reference) with int32_t ids.
59  template <class String>
60  void getOrAddBulkArray(const std::vector<std::vector<String>>& string_array_vec,
61  std::vector<std::vector<int32_t>>& ids_array_vec);
// Const lookup of the id for str.
// NOTE(review): presumably returns INVALID_STR_ID when str is absent — confirm.
62  int32_t getIdOfString(const std::string& str) const;
// Returns a copy of the string for string_id. Not declared noexcept.
63  std::string getString(int32_t string_id) const;
// Returns a (pointer, length) pair for string_id; declared noexcept.
// NOTE(review): ownership and lifetime of the returned char* are not visible
// here — confirm before retaining the pointer.
64  std::pair<char*, size_t> getStringBytes(int32_t string_id) const noexcept;
// Returns the number of entries in storage as a size_t.
65  size_t storageEntryCount() const;
66 
// Returns a vector of int32_t string ids for a LIKE-style pattern.
// NOTE(review): presumably the ids of strings matching pattern, limited to the
// given generation, with results memoized in like_cache_ — confirm.
67  std::vector<int32_t> getLike(const std::string& pattern,
68  const bool icase,
69  const bool is_simple,
70  const char escape,
71  const size_t generation) const;
72 
// Returns a vector of int32_t string ids for a comparison against pattern using
// the operator named by comp_operator. Unlike getLike and getRegexpLike, this
// member is not declared const.
73  std::vector<int32_t> getCompare(const std::string& pattern,
74  const std::string& comp_operator,
75  const size_t generation);
76 
// Returns a vector of int32_t string ids for a regular-expression pattern.
// NOTE(review): presumably memoized in regex_cache_ (keyed by pattern and
// escape) — confirm.
77  std::vector<int32_t> getRegexpLike(const std::string& pattern,
78  const char escape,
79  const size_t generation) const;
80 
// Returns a shared pointer to an immutable vector of strings.
// NOTE(review): presumably all dictionary strings, served from strings_cache_
// after the first call — confirm.
81  std::shared_ptr<const std::vector<std::string>> copyStrings() const;
82 
// Returns a bool status; declared noexcept.
// NOTE(review): presumably true on a successful flush of storage — confirm.
83  bool checkpoint() noexcept;
84 
// Populates provided dest_ids vector with string ids corresponding to given
// source strings.
// transient_mapping defaults to an empty map.
// NOTE(review): transient_mapping is taken by const value, so the map is copied
// on every call; a const reference would avoid that if the definition allows it.
102  static void populate_string_ids(
103  std::vector<int32_t>& dest_ids,
104  StringDictionary* dest_dict,
105  const std::vector<int32_t>& source_ids,
106  const StringDictionary* source_dict,
107  const std::map<int32_t, std::string> transient_mapping = {});
108 
// Array counterpart of populate_string_ids: operates on vectors of id vectors
// and takes no transient mapping.
// NOTE(review): presumably translates each inner vector independently — confirm.
109  static void populate_string_array_ids(
110  std::vector<std::vector<int32_t>>& dest_array_ids,
111  StringDictionary* dest_dict,
112  const std::vector<std::vector<int32_t>>& source_array_ids,
113  const StringDictionary* source_dict);
114 
// Sentinel id value (-1).
// NOTE(review): presumably returned by lookups when a string is absent — confirm.
115  static constexpr int32_t INVALID_STR_ID = -1;
// (1 << 15) - 1 == 32767.
116  static constexpr size_t MAX_STRLEN = (1 << 15) - 1;
// (1U << 31) - 1 == 2147483647, the largest value representable in int32_t.
117  static constexpr size_t MAX_STRCOUNT = (1U << 31) - 1;
118 
119  private:
// Index entry made of two uint64_t bit-fields totalling 64 bits: a 48-bit `off`
// and a 16-bit `size`.
// NOTE(review): presumably `off` is the byte offset of a string within the
// payload storage and `size` its length in bytes — confirm.
120  struct StringIdxEntry {
121  uint64_t off : 48;
122  uint64_t size : 16;
123  };
124 
125  // In compare_cache_value_t, index is the position within the sorted cache.
126  // The diff component records whether the string at that position is equal to
127  // the pattern it is cached for. Storing diff means we don't have to compare
128  // the strings again when we retrieve the entry from the cache.
130  int32_t index;
131  int32_t diff;
132  };
133 
// Plain aggregate returned by getStringFromStorage(): a character pointer, a
// length and a boolean flag.
// NOTE(review): c_str_ptr is a raw, presumably non-owning pointer into storage,
// and `canary` presumably marks entries related to CANARY_BUFFER — confirm both.
134  struct PayloadString {
135  char* c_str_ptr;
136  size_t size;
137  bool canary;
138  };
139 
141  std::vector<std::future<std::vector<std::pair<string_dict_hash_t, unsigned int>>>>&
142  dictionary_futures);
// Returns a string count derived from storage, given the number of storage slots.
// NOTE(review): presumably scans the first storage_slots entries for the last
// valid one — confirm.
143  size_t getNumStringsFromStorage(const size_t storage_slots) const noexcept;
// Predicate on num_strings.
// NOTE(review): presumably compares num_strings with the hash table size to
// decide whether the table should grow — the threshold is not visible here.
144  bool fillRateIsHigh(const size_t num_strings) const noexcept;
// Grows the hash table; takes no arguments and is declared noexcept.
145  void increaseHashTableCapacity() noexcept;
146  template <class String>
148  const size_t str_count,
149  const size_t storage_high_water_mark,
150  const std::vector<String>& input_strings,
151  const std::vector<size_t>& string_memory_ids,
152  const std::vector<string_dict_hash_t>& input_strings_hashes) noexcept;
// string_view-based counterpart of getOrAdd(); returns an int32_t id.
// NOTE(review): presumably the shared implementation behind getOrAdd — confirm.
153  int32_t getOrAddImpl(const std::string_view& str) noexcept;
// Fills `hashes` (non-const reference) with string_dict_hash_t values for string_vec.
// NOTE(review): presumably one hash per input string, in input order — confirm.
154  template <class String>
155  void hashStrings(const std::vector<String>& string_vec,
156  std::vector<string_dict_hash_t>& hashes) const noexcept;
// Same signature as getOrAddBulk.
// NOTE(review): presumably forwards the request to a remote dictionary via
// client_ — confirm.
157  template <class T, class String>
158  void getOrAddBulkRemote(const std::vector<String>& string_vec, T* encoded_vec);
// Const lookups declared noexcept.
// NOTE(review): the "Unlocked" suffix suggests the caller must already hold
// rw_mutex_ — confirm the locking contract before calling.
159  int32_t getUnlocked(const std::string& str) const noexcept;
160  std::string getStringUnlocked(int32_t string_id) const noexcept;
// NOTE(review): the "Checked" variants presumably validate string_id before
// reading storage; how a failed check is reported from a noexcept function is
// not visible here — confirm.
161  std::string getStringChecked(const int string_id) const noexcept;
162  std::pair<char*, size_t> getStringBytesChecked(const int string_id) const noexcept;
163  template <class String>
164  uint32_t computeBucket(
166  const String& input_string,
167  const std::vector<int32_t>& string_id_string_dict_hash_table) const noexcept;
168  template <class String>
170  const string_dict_hash_t input_string_hash,
171  const String& input_string,
172  const std::vector<int32_t>& string_id_string_dict_hash_table,
173  const size_t storage_high_water_mark,
174  const std::vector<String>& input_strings,
175  const std::vector<size_t>& string_memory_ids) const noexcept;
178  const std::vector<int32_t>& string_id_string_dict_hash_table) noexcept;
// Capacity checks for an upcoming write of write_length. Unlike the add*Capacity
// helpers below, these two are not declared noexcept.
179  void checkAndConditionallyIncreasePayloadCapacity(const size_t write_length);
180  void checkAndConditionallyIncreaseOffsetCapacity(const size_t write_length);
181 
// Appends one string to storage. The string is taken by value (const String).
182  template <class String>
183  void appendToStorage(const String str) noexcept;
// Bulk append.
// NOTE(review): string_memory_ids presumably indexes into input_strings to
// select the strings to append, and sum_new_strings_lengths their combined
// length — confirm.
184  template <class String>
185  void appendToStorageBulk(const std::vector<String>& input_strings,
186  const std::vector<size_t>& string_memory_ids,
187  const size_t sum_new_strings_lengths) noexcept;
// Reads the entry for string_id and returns it as a PayloadString.
188  PayloadString getStringFromStorage(const int string_id) const noexcept;
// Same lookup returning a std::string_view.
// NOTE(review): the view presumably points into dictionary-owned storage and may
// dangle if that storage is remapped — confirm the lifetime guarantee.
189  std::string_view getStringFromStorageFast(const int string_id) const noexcept;
// Capacity growth helpers; min_capacity_requested defaults to 0 in all of them.
190  void addPayloadCapacity(const size_t min_capacity_requested = 0) noexcept;
191  void addOffsetCapacity(const size_t min_capacity_requested = 0) noexcept;
// Takes a file descriptor and returns a size_t.
// NOTE(review): presumably the number of bytes added to the file — confirm.
192  size_t addStorageCapacity(int fd, const size_t min_capacity_requested = 0) noexcept;
// Takes the current address and size (mem_size is an in/out reference) and
// returns a void*.
// NOTE(review): presumably the possibly relocated buffer address — confirm.
193  void* addMemoryCapacity(void* addr,
194  size_t& mem_size,
195  const size_t min_capacity_requested = 0) noexcept;
// NOTE(review): presumably clears the lookup caches (like_cache_, regex_cache_,
// equal_cache_, compare_cache_) — confirm which ones.
196  void invalidateInvertedIndex() noexcept;
// Returns a vector of int32_t ids. Both strings are taken by value.
197  std::vector<int32_t> getEquals(std::string pattern,
198  std::string comp_operator,
199  size_t generation);
// Sorted-cache maintenance helpers operating on vectors of int32_t ids.
200  void buildSortedCache();
201  void insertInSortedCache(std::string str, int32_t str_id);
202  void sortCache(std::vector<int32_t>& cache);
203  void mergeSortedCache(std::vector<int32_t>& temp_sorted_cache);
// Returns a raw compare_cache_value_t pointer from a const member.
// NOTE(review): ownership of the returned pointer is not visible here — confirm
// who frees it.
204  compare_cache_value_t* binary_search_cache(const std::string& pattern) const;
205 
// Folder path; const, so it cannot change after construction.
206  const std::string folder_;
// NOTE(review): presumably the number of strings currently stored — confirm.
207  size_t str_count_;
// NOTE(review): presumably a running count of hash-bucket collisions — confirm.
208  size_t collisions_;
// Hash table whose slots hold int32_t string ids.
209  std::vector<int32_t> string_id_string_dict_hash_table_;
// Vector of int32_t ids used by the sorted-cache helpers. Note that this member
// lacks the trailing underscore used by the other data members.
211  std::vector<int32_t> sorted_cache;
212  bool isTemp_;
// NOTE(review): presumably the path of the offsets file under folder_ — confirm.
214  std::string offsets_path_;
223  mutable std::map<std::tuple<std::string, bool, bool, char>, std::vector<int32_t>>
// Mutable caches, so const member functions can update them.
// Keyed by (string, char); maps to a vector of int32_t ids.
225  mutable std::map<std::pair<std::string, char>, std::vector<int32_t>> regex_cache_;
// Keyed by string; maps to a single int32_t id.
226  mutable std::map<std::string, int32_t> equal_cache_;
// Shared vector of strings; copyStrings() returns a shared pointer of the
// matching const type.
228  mutable std::shared_ptr<std::vector<std::string>> strings_cache_;
// Uniquely owned client object.
// NOTE(review): presumably set only for dictionaries built from a LeafHostInfo —
// confirm.
229  std::unique_ptr<StringDictionaryClient> client_;
231 
// Raw buffer pointer, initialised to nullptr, and its size, initialised to 0.
// NOTE(review): the buffer is a raw char*, so allocation and release must be
// handled manually in the implementation — confirm it is freed in the destructor.
232  char* CANARY_BUFFER{nullptr};
233  size_t canary_buffer_size = 0;
234 };
235 
// Free function mapping a string id and a generation to an int32_t.
// NOTE(review): presumably returns StringDictionary::INVALID_STR_ID for ids that
// are not below `generation` and the id itself otherwise — confirm.
236 int32_t truncate_to_generation(const int32_t id, const size_t generation);
237 
// Free function that fills dest_ids (non-const reference) from source_ids, given
// a dictionary server host, source and destination dictionary references, and a
// destination generation.
// NOTE(review): presumably asks the dictionary server at dict_server_host to
// translate ids from the source dictionary to the destination dictionary —
// confirm against the implementation.
238 void translate_string_ids(std::vector<int32_t>& dest_ids,
239  const LeafHostInfo& dict_server_host,
240  const DictRef dest_dict_ref,
241  const std::vector<int32_t>& source_ids,
242  const DictRef source_dict_ref,
243  const int32_t dest_generation);
244 
245 #endif // STRINGDICTIONARY_STRINGDICTIONARY_H
StringIdxEntry * offset_map_
void increaseHashTableCapacity() noexcept
void checkAndConditionallyIncreasePayloadCapacity(const size_t write_length)
size_t addStorageCapacity(int fd, const size_t min_capacity_requested=0) noexcept
std::vector< int32_t > getRegexpLike(const std::string &pattern, const char escape, const size_t generation) const
uint64_t off
std::pair< char *, size_t > getStringBytesChecked(const int string_id) const noexcept
uint64_t size
#define const
size_t storageEntryCount() const
void addOffsetCapacity(const size_t min_capacity_requested=0) noexcept
uint32_t computeBucketFromStorageAndMemory(const string_dict_hash_t input_string_hash, const String &input_string, const std::vector< int32_t > &string_id_string_dict_hash_table, const size_t storage_high_water_mark, const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids) const noexcept
std::string getStringChecked(const int string_id) const noexcept
std::vector< string_dict_hash_t > hash_cache_
DictionaryCache< std::string, compare_cache_value_t > compare_cache_
bool fillRateIsHigh(const size_t num_strings) const noexcept
void * addMemoryCapacity(void *addr, size_t &mem_size, const size_t min_capacity_requested=0) noexcept
static void populate_string_ids(std::vector< int32_t > &dest_ids, StringDictionary *dest_dict, const std::vector< int32_t > &source_ids, const StringDictionary *source_dict, const std::map< int32_t, std::string > transient_mapping={})
Populates provided dest_ids vector with string ids corresponding to given source strings.
std::string offsets_path_
std::string_view getStringFromStorageFast(const int string_id) const noexcept
int32_t getIdOfString(const std::string &str) const
int32_t getOrAdd(const std::string &str) noexcept
int32_t getUnlocked(const std::string &str) const noexcept
std::map< std::pair< std::string, char >, std::vector< int32_t > > regex_cache_
std::unique_ptr< StringDictionaryClient > client_
std::string getStringUnlocked(int32_t string_id) const noexcept
StringDictionary(const std::string &folder, const bool isTemp, const bool recover, const bool materializeHashes=false, size_t initial_capacity=256)
static constexpr size_t MAX_STRCOUNT
std::vector< int32_t > getEquals(std::string pattern, std::string comp_operator, size_t generation)
uint32_t computeBucket(const string_dict_hash_t hash, const String &input_string, const std::vector< int32_t > &string_id_string_dict_hash_table) const noexcept
static constexpr int32_t INVALID_STR_ID
std::shared_ptr< std::vector< std::string > > strings_cache_
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator, const size_t generation)
std::shared_timed_mutex mapd_shared_mutex
const std::string folder_
mapd_shared_mutex rw_mutex_
void appendToStorageBulk(const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids, const size_t sum_new_strings_lengths) noexcept
void addPayloadCapacity(const size_t min_capacity_requested=0) noexcept
std::map< std::string, int32_t > equal_cache_
uint32_t computeUniqueBucketWithHash(const string_dict_hash_t hash, const std::vector< int32_t > &string_id_string_dict_hash_table) noexcept
void getOrAddBulkArray(const std::vector< std::vector< String >> &string_array_vec, std::vector< std::vector< int32_t >> &ids_array_vec)
void translate_string_ids(std::vector< int32_t > &dest_ids, const LeafHostInfo &dict_server_host, const DictRef dest_dict_ref, const std::vector< int32_t > &source_ids, const DictRef source_dict_ref, const int32_t dest_generation)
void appendToStorage(const String str) noexcept
void getOrAddBulk(const std::vector< String > &string_vec, T *encoded_vec)
std::pair< char *, size_t > getStringBytes(int32_t string_id) const noexcept
DictPayloadUnavailable(const std::string &err)
void processDictionaryFutures(std::vector< std::future< std::vector< std::pair< string_dict_hash_t, unsigned int >>>> &dictionary_futures)
bool checkpoint() noexcept
std::vector< int32_t > getLike(const std::string &pattern, const bool icase, const bool is_simple, const char escape, const size_t generation) const
void mergeSortedCache(std::vector< int32_t > &temp_sorted_cache)
size_t getNumStringsFromStorage(const size_t storage_slots) const noexcept
std::string getString(int32_t string_id) const
void hashStrings(const std::vector< String > &string_vec, std::vector< string_dict_hash_t > &hashes) const noexcept
std::unique_ptr< StringDictionaryClient > client_no_timeout_
void increaseHashTableCapacityFromStorageAndMemory(const size_t str_count, const size_t storage_high_water_mark, const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids, const std::vector< string_dict_hash_t > &input_strings_hashes) noexcept
void checkAndConditionallyIncreaseOffsetCapacity(const size_t write_length)
static void populate_string_array_ids(std::vector< std::vector< int32_t >> &dest_array_ids, StringDictionary *dest_dict, const std::vector< std::vector< int32_t >> &source_array_ids, const StringDictionary *source_dict)
void invalidateInvertedIndex() noexcept
uint32_t string_dict_hash_t
std::vector< int32_t > string_id_string_dict_hash_table_
void sortCache(std::vector< int32_t > &cache)
static constexpr size_t MAX_STRLEN
void getOrAddBulkParallel(const std::vector< String > &string_vec, T *encoded_vec)
std::map< std::tuple< std::string, bool, bool, char >, std::vector< int32_t > > like_cache_
bool g_enable_stringdict_parallel
PayloadString getStringFromStorage(const int string_id) const noexcept
compare_cache_value_t * binary_search_cache(const std::string &pattern) const
void insertInSortedCache(std::string str, int32_t str_id)
void getOrAddBulkRemote(const std::vector< String > &string_vec, T *encoded_vec)
std::vector< int32_t > sorted_cache
int32_t getOrAddImpl(const std::string_view &str) noexcept
int32_t truncate_to_generation(const int32_t id, const size_t generation)
~StringDictionary() noexcept
std::shared_ptr< const std::vector< std::string > > copyStrings() const