OmniSciDB  29e35f4d58
StringDictionary.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef STRINGDICTIONARY_STRINGDICTIONARY_H
18 #define STRINGDICTIONARY_STRINGDICTIONARY_H
19 
20 #include "../Shared/mapd_shared_mutex.h"
21 #include "DictRef.h"
22 #include "DictionaryCache.hpp"
23 
24 #include <future>
25 #include <map>
26 #include <string>
27 #include <tuple>
28 #include <vector>
29 
31 
// Exception type derived publicly from std::runtime_error.
// The default constructor uses the fixed message "DictPayloadUnavailable"; the
// one-argument constructor forwards a caller-supplied message to the base class.
// NOTE(review): <stdexcept> is not among the includes visible in this header, so
// std::runtime_error presumably arrives through another include - confirm.
32 class DictPayloadUnavailable : public std::runtime_error {
33  public:
// Constructs the exception with the fixed message "DictPayloadUnavailable".
34  DictPayloadUnavailable() : std::runtime_error("DictPayloadUnavailable") {}
35 
// Constructs the exception with the message `err`.
// NOTE(review): this constructor is not `explicit`, so a std::string converts
// implicitly to DictPayloadUnavailable.
36  DictPayloadUnavailable(const std::string& err) : std::runtime_error(err) {}
37 };
38 
39 class LeafHostInfo;
40 
// Construction and destruction. Only the declarations are visible here; the
// definitions live outside this header.
42  public:
// Constructor taking a folder path and three flags plus an initial capacity.
// `materializeHashes` defaults to false and `initial_capacity` defaults to 256.
// NOTE(review): presumably `folder` is where the dictionary's files live and
// `recover` reloads existing contents - confirm against the definition.
43  StringDictionary(const std::string& folder,
44  const bool isTemp,
45  const bool recover,
46  const bool materializeHashes = false,
47  size_t initial_capacity = 256);
// Constructor taking a LeafHostInfo and a DictRef (passed by value).
// NOTE(review): presumably sets up a dictionary served by a remote host via the
// client_ members - confirm against the definition.
48  StringDictionary(const LeafHostInfo& host, const DictRef dict_ref);
// Destructor, declared noexcept.
49  ~StringDictionary() noexcept;
50 
// String <-> id entry points. Only declarations are visible in this header, so the
// behavioral notes below are inferred from names and signatures and are marked as such.

// Takes a string and returns an int32_t; declared noexcept.
// NOTE(review): presumably returns the id of `str`, adding it first if absent - confirm.
51  int32_t getOrAdd(const std::string& str) noexcept;
// Bulk variant, templated on the output element type T; `encoded_vec` is a raw T*.
// NOTE(review): presumably `encoded_vec` must have room for string_vec.size()
// elements - confirm against the definition.
52  template <class T>
53  void getOrAddBulk(const std::vector<std::string>& string_vec, T* encoded_vec);
// Variant for a vector of string vectors; `ids_array_vec` is a non-const reference
// and so is presumably the output - confirm.
54  void getOrAddBulkArray(const std::vector<std::vector<std::string>>& string_array_vec,
55  std::vector<std::vector<int32_t>>& ids_array_vec);
// Const lookup from a string to an int32_t.
// NOTE(review): presumably yields INVALID_STR_ID when the string is absent - confirm.
56  int32_t getIdOfString(const std::string& str) const;
// Const lookup from an id to a std::string returned by value.
57  std::string getString(int32_t string_id) const;
// Const, noexcept lookup returning a (char*, size_t) pair for an id.
// NOTE(review): presumably pointer and length of the stored bytes; the lifetime of
// the pointed-to memory is not shown here - confirm before retaining the pointer.
58  std::pair<char*, size_t> getStringBytes(int32_t string_id) const noexcept;
// Const accessor returning a size_t.
// NOTE(review): presumably the number of stored strings (cf. str_count_) - confirm.
59  size_t storageEntryCount() const;
60 
// Pattern queries. Each returns a vector of int32_t by value and takes a
// `generation` argument.
// NOTE(review): presumably `generation` restricts results to ids that existed at
// that generation (cf. truncate_to_generation declared in this header) - confirm.

// Const query taking a pattern, two boolean flags and an escape character.
// NOTE(review): the (string, bool, bool, char) parameter types match the key type of
// the mutable like_cache_ member, so results are presumably cached there - confirm.
61  std::vector<int32_t> getLike(const std::string& pattern,
62  const bool icase,
63  const bool is_simple,
64  const char escape,
65  const size_t generation) const;
66 
// Query taking a pattern and a comparison operator passed as a string.
// NOTE(review): unlike getLike and getRegexpLike this member is not const; the
// accepted spellings of `comp_operator` are not visible in this header.
67  std::vector<int32_t> getCompare(const std::string& pattern,
68  const std::string& comp_operator,
69  const size_t generation);
70 
// Const query taking a pattern and an escape character.
// NOTE(review): the (string, char) parameter types match the key type of the mutable
// regex_cache_ member, so results are presumably cached there - confirm.
71  std::vector<int32_t> getRegexpLike(const std::string& pattern,
72  const char escape,
73  const size_t generation) const;
74 
// Const accessor returning a shared_ptr to a const vector of strings.
// NOTE(review): presumably backed by the mutable strings_cache_ member - confirm.
75  std::shared_ptr<const std::vector<std::string>> copyStrings() const;
76 
// Returns a bool and is declared noexcept.
// NOTE(review): presumably the bool reports success rather than throwing - confirm.
77  bool checkpoint() noexcept;
78 
// Static helper taking a destination id vector and dictionary, and a source id vector
// and dictionary. `dest_ids` and `dest_dict` are non-const; the source arguments are const.
// NOTE(review): presumably fills `dest_ids` with ids in `dest_dict` for the strings
// that `source_ids` denote in `source_dict` - confirm against the definition.
// NOTE(review): `transient_mapping` is a const std::map passed by value (default
// empty), so each call with a non-empty argument copies the map. Switching to a const
// reference would need a matching change in the out-of-line definition.
96  static void populate_string_ids(
97  std::vector<int32_t>& dest_ids,
98  StringDictionary* dest_dict,
99  const std::vector<int32_t>& source_ids,
100  const StringDictionary* source_dict,
101  const std::map<int32_t, std::string> transient_mapping = {});
102 
// Static helper with the same parameter shape as populate_string_ids, but operating
// on vectors of id vectors and without a transient mapping parameter.
103  static void populate_string_array_ids(
104  std::vector<std::vector<int32_t>>& dest_array_ids,
105  StringDictionary* dest_dict,
106  const std::vector<std::vector<int32_t>>& source_array_ids,
107  const StringDictionary* source_dict);
108 
// Compile-time constants of the class.
// INVALID_STR_ID is -1.
// NOTE(review): presumably the sentinel for "no such string" - confirm at call sites.
109  static constexpr int32_t INVALID_STR_ID = -1;
// MAX_STRLEN evaluates to 2^15 - 1 = 32767.
110  static constexpr size_t MAX_STRLEN = (1 << 15) - 1;
// MAX_STRCOUNT evaluates to 2^31 - 1 = 2147483647, the largest value an int32_t can
// hold. The shift is done on an unsigned literal (1U), so it does not overflow a
// signed int.
111  static constexpr size_t MAX_STRCOUNT = (1U << 31) - 1;
112 
113  private:
// Index entry made of two uint64_t bit-fields: a 48-bit `off` and a 16-bit `size`,
// 64 bits in total. The offset_map_ member is declared as a pointer to this type.
// NOTE(review): presumably `off` is a byte offset into the payload and `size` the
// string length in bytes - confirm against the definition of appendToStorage.
114  struct StringIdxEntry {
115  uint64_t off : 48;
116  uint64_t size : 16;
117  };
118 
119  // In compare_cache_value_t, index represents the index into the sorted cache.
120  // The diff component represents whether the entry at that index is equal to
121  // the pattern it is cached for. We use diff so we don't have to compare the string
122  // again when we are retrieving it from the cache.
124  int32_t index;
125  int32_t diff;
126  };
127 
// Plain aggregate holding a char pointer, a size and a bool flag. It is the return
// type of getStringFromStorage, declared in this class.
// NOTE(review): presumably `canary` marks an entry that refers to CANARY_BUFFER
// rather than real payload bytes - confirm against the definition.
128  struct PayloadString {
129  char* c_str_ptr;
130  size_t size;
131  bool canary;
132  };
133 
// Private member function declarations. The definitions are not in this header, so
// the notes below describe only what the signatures show; inferred purposes are hedged.

// Takes a non-const reference to a vector of futures, each yielding a vector of
// (uint32_t, unsigned int) pairs.
// NOTE(review): the meaning of the two pair components is not visible here.
134  void processDictionaryFutures(
135  std::vector<std::future<std::vector<std::pair<uint32_t, unsigned int>>>>&
136  dictionary_futures);
// Const, noexcept predicate.
// NOTE(review): presumably checked before growing via increaseCapacity - confirm.
137  bool fillRateIsHigh() const noexcept;
138  void increaseCapacity() noexcept;
// NOTE(review): presumably the implementation behind the public getOrAdd - confirm.
139  int32_t getOrAddImpl(const std::string& str) noexcept;
// Same parameter list as the public getOrAddBulk.
// NOTE(review): presumably the path used for the host-based constructor - confirm.
140  template <class T>
141  void getOrAddBulkRemote(const std::vector<std::string>& string_vec, T* encoded_vec);
// NOTE(review): the "Unlocked" suffix presumably means the caller must already hold
// rw_mutex_ - confirm against the definitions.
142  int32_t getUnlocked(const std::string& str) const noexcept;
143  std::string getStringUnlocked(int32_t string_id) const noexcept;
// NOTE(review): these two take `const int` where the public API uses int32_t for ids.
144  std::string getStringChecked(const int string_id) const noexcept;
145  std::pair<char*, size_t> getStringBytesChecked(const int string_id) const noexcept;
// NOTE(review): `str` is a const std::string passed by value, so every call copies
// the string. A const reference would avoid that but needs a matching change in the
// definition.
146  uint32_t computeBucket(const uint32_t hash,
147  const std::string str,
148  const std::vector<int32_t>& data,
149  const bool unique) const noexcept;
150  uint32_t computeUniqueBucketWithHash(const uint32_t hash,
151  const std::vector<int32_t>& data) const noexcept;
152  void appendToStorage(const std::string& str) noexcept;
// Returns a PayloadString (pointer, size, canary flag) for the given id.
153  PayloadString getStringFromStorage(const int string_id) const noexcept;
154  void addPayloadCapacity() noexcept;
155  void addOffsetCapacity() noexcept;
// Takes a file descriptor and returns a size_t.
// NOTE(review): presumably the new size or the amount added - confirm.
156  size_t addStorageCapacity(int fd) noexcept;
// Takes an address and a size by non-const reference; returns a void*.
// NOTE(review): presumably `mem_size` is updated and the returned pointer replaces
// `addr` - confirm against the definition.
157  void* addMemoryCapacity(void* addr, size_t& mem_size) noexcept;
158  void invalidateInvertedIndex() noexcept;
// NOTE(review): both strings are passed by value here, whereas the public getCompare
// takes them by const reference.
159  std::vector<int32_t> getEquals(std::string pattern,
160  std::string comp_operator,
161  size_t generation);
162  void buildSortedCache();
// NOTE(review): `str` is passed by value.
163  void insertInSortedCache(std::string str, int32_t str_id);
164  void sortCache(std::vector<int32_t>& cache);
165  void mergeSortedCache(std::vector<int32_t>& temp_sorted_cache);
// Const member returning a raw pointer to a compare_cache_value_t.
// NOTE(review): ownership of the returned pointer is not visible here - confirm
// before storing or deleting it.
166  compare_cache_value_t* binary_search_cache(const std::string& pattern) const;
167 
// Data members. Roles noted below are inferred from names and types unless stated
// otherwise; initialization happens in the constructors, which are not visible here.
168  size_t str_count_;
169  std::vector<int32_t> str_ids_;
170  std::vector<uint32_t> rk_hashes_;
// NOTE(review): this member lacks the trailing underscore the other members use.
171  std::vector<int32_t> sorted_cache;
172  bool isTemp_;
173  bool materialize_hashes_;
174  std::string offsets_path_;
// Raw int file descriptors and raw pointers.
// NOTE(review): the *_map_ pointers are presumably memory mappings of the two files,
// released in the destructor - confirm.
175  int payload_fd_;
176  int offset_fd_;
177  StringIdxEntry* offset_map_;
178  char* payload_map_;
179  size_t offset_file_size_;
180  size_t payload_file_size_;
181  size_t payload_file_off_;
// The members below are declared mutable, so const member functions may modify them.
182  mutable mapd_shared_mutex rw_mutex_;
// Keyed by (string, bool, bool, char), matching getLike's non-generation parameters.
183  mutable std::map<std::tuple<std::string, bool, bool, char>, std::vector<int32_t>>
184  like_cache_;
// Keyed by (string, char), matching getRegexpLike's non-generation parameters.
185  mutable std::map<std::pair<std::string, char>, std::vector<int32_t>> regex_cache_;
186  mutable std::map<std::string, int32_t> equal_cache_;
187  mutable DictionaryCache<std::string, compare_cache_value_t> compare_cache_;
188  mutable std::shared_ptr<std::vector<std::string>> strings_cache_;
// Two uniquely owned StringDictionaryClient instances.
// NOTE(review): presumably only set by the LeafHostInfo constructor - confirm.
189  std::unique_ptr<StringDictionaryClient> client_;
190  std::unique_ptr<StringDictionaryClient> client_no_timeout_;
191 
// Static char pointer shared by all instances; defined outside this header.
192  static char* CANARY_BUFFER;
193 };
194 
// Free function mapping an id and a generation to an int32_t.
// NOTE(review): presumably returns `id` unchanged when it belongs to the given
// generation and an invalid id otherwise - confirm against the definition.
195 int32_t truncate_to_generation(const int32_t id, const size_t generation);
196 
// Free function taking a dictionary server host, destination and source DictRefs
// (by value), a source id vector and a destination generation. `dest_ids` is the
// only non-const parameter.
// NOTE(review): presumably `dest_ids` receives the translated ids - confirm.
197 void translate_string_ids(std::vector<int32_t>& dest_ids,
198  const LeafHostInfo& dict_server_host,
199  const DictRef dest_dict_ref,
200  const std::vector<int32_t>& source_ids,
201  const DictRef source_dict_ref,
202  const int32_t dest_generation);
203 
204 #endif // STRINGDICTIONARY_STRINGDICTIONARY_H
uint64_t off
uint64_t size
void translate_string_ids(std::vector< int32_t > &dest_ids, const LeafHostInfo &dict_server_host, const DictRef dest_dict_ref, const std::vector< int32_t > &source_ids, const DictRef source_dict_ref, const int32_t dest_generation)
std::shared_timed_mutex mapd_shared_mutex
DictPayloadUnavailable(const std::string &err)
int32_t truncate_to_generation(const int32_t id, const size_t generation)