OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringDictionaryProxy.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
19 #include "Logger/Logger.h"
20 #include "Shared/ThreadInfo.h"
21 #include "Shared/misc.h"
22 #include "Shared/sqltypes.h"
23 #include "Shared/thread_count.h"
25 #include "StringOps/StringOps.h"
26 #include "Utils/Regexp.h"
27 #include "Utils/StringLike.h"
28 
29 #include <tbb/parallel_for.h>
30 #include <tbb/task_arena.h>
31 
32 #include <algorithm>
33 #include <iomanip>
34 #include <iostream>
35 #include <string>
36 #include <string_view>
37 #include <thread>
38 
39 StringDictionaryProxy::StringDictionaryProxy(std::shared_ptr<StringDictionary> sd,
40  const int32_t string_dict_id,
41  const int64_t generation)
42  : string_dict_(sd), string_dict_id_(string_dict_id), generation_(generation) {}
43 
44 int32_t truncate_to_generation(const int32_t id, const size_t generation) {
46  return id;
47  }
48  CHECK_GE(id, 0);
49  return static_cast<size_t>(id) >= generation ? StringDictionary::INVALID_STR_ID : id;
50 }
51 
53  const std::vector<std::string>& strings) const {
55  std::vector<int32_t> string_ids(strings.size());
56  getTransientBulkImpl(strings, string_ids.data(), true);
57  return string_ids;
58 }
59 
61  const std::vector<std::string>& strings) {
63  const size_t num_strings = strings.size();
64  std::vector<int32_t> string_ids(num_strings);
65  if (num_strings == 0) {
66  return string_ids;
67  }
68  // Since new strings added to a StringDictionaryProxy are not materialized in the
69  // proxy's underlying StringDictionary, we can use the fast parallel
70  // StringDictionary::getBulk method to fetch ids from the underlying dictionary (which
71  // will return StringDictionary::INVALID_STR_ID for strings that don't exist)
72 
73  // Don't need to be under lock here as the string ids for strings in the underlying
74  // materialized dictionary are immutable
75  const size_t num_strings_not_found =
76  string_dict_->getBulk(strings, string_ids.data(), generation_);
77  if (num_strings_not_found > 0) {
78  std::lock_guard<std::shared_mutex> write_lock(rw_mutex_);
79  for (size_t string_idx = 0; string_idx < num_strings; ++string_idx) {
80  if (string_ids[string_idx] == StringDictionary::INVALID_STR_ID) {
81  string_ids[string_idx] = getOrAddTransientUnlocked(strings[string_idx]);
82  }
83  }
84  }
85  return string_ids;
86 }
87 
88 template <typename String>
90  unsigned const new_index = transient_str_to_int_.size();
91  auto transient_id = transientIndexToId(new_index);
92  auto const emplaced = transient_str_to_int_.emplace(str, transient_id);
93  if (emplaced.second) { // (str, transient_id) was added to transient_str_to_int_.
94  transient_string_vec_.push_back(&emplaced.first->first);
95  } else { // str already exists in transient_str_to_int_. Return existing transient_id.
96  transient_id = emplaced.first->second;
97  }
98  return transient_id;
99 }
100 
101 int32_t StringDictionaryProxy::getOrAddTransient(const std::string& str) {
102  auto const string_id = getIdOfStringFromClient(str);
103  if (string_id != StringDictionary::INVALID_STR_ID) {
104  return string_id;
105  }
106  std::lock_guard<std::shared_mutex> write_lock(rw_mutex_);
107  return getOrAddTransientUnlocked(str);
108 }
109 
110 int32_t StringDictionaryProxy::getIdOfString(const std::string& str) const {
111  std::shared_lock<std::shared_mutex> read_lock(rw_mutex_);
112  auto const str_id = getIdOfStringFromClient(str);
113  if (str_id != StringDictionary::INVALID_STR_ID || transient_str_to_int_.empty()) {
114  return str_id;
115  }
116  auto it = transient_str_to_int_.find(str);
117  return it != transient_str_to_int_.end() ? it->second
119 }
120 
121 template <typename String>
122 int32_t StringDictionaryProxy::getIdOfStringFromClient(const String& str) const {
123  CHECK_GE(generation_, 0);
124  return truncate_to_generation(string_dict_->getIdOfString(str), generation_);
125 }
126 
127 int32_t StringDictionaryProxy::getIdOfStringNoGeneration(const std::string& str) const {
128  std::shared_lock<std::shared_mutex> read_lock(rw_mutex_);
129  auto str_id = string_dict_->getIdOfString(str);
130  if (str_id != StringDictionary::INVALID_STR_ID || transient_str_to_int_.empty()) {
131  return str_id;
132  }
133  auto it = transient_str_to_int_.find(str);
134  return it != transient_str_to_int_.end() ? it->second
136 }
137 
138 std::string StringDictionaryProxy::getString(int32_t string_id) const {
139  if (inline_int_null_value<int32_t>() == string_id) {
140  return "";
141  }
142  std::shared_lock<std::shared_mutex> read_lock(rw_mutex_);
143  return getStringUnlocked(string_id);
144 }
145 
146 std::string StringDictionaryProxy::getStringUnlocked(const int32_t string_id) const {
147  if (string_id >= 0 && storageEntryCount() > 0) {
148  return string_dict_->getString(string_id);
149  }
150  unsigned const string_index = transientIdToIndex(string_id);
151  CHECK_LT(string_index, transient_string_vec_.size());
152  return *transient_string_vec_[string_index];
153 }
154 
155 std::vector<std::string> StringDictionaryProxy::getStrings(
156  const std::vector<int32_t>& string_ids) const {
157  std::vector<std::string> strings;
158  if (!string_ids.empty()) {
159  strings.reserve(string_ids.size());
160  for (const auto string_id : string_ids) {
161  if (string_id >= 0) {
162  strings.emplace_back(string_dict_->getString(string_id));
163  } else if (inline_int_null_value<int32_t>() == string_id) {
164  strings.emplace_back("");
165  } else {
166  unsigned const string_index = transientIdToIndex(string_id);
167  strings.emplace_back(*transient_string_vec_[string_index]);
168  }
169  }
170  }
171  return strings;
172 }
173 
174 template <typename String>
176  const String& lookup_string) const {
177  const auto it = transient_str_to_int_.find(lookup_string);
179  : it->second;
180 }
181 
184  const StringDictionaryProxy* dest_proxy,
185  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const {
186  auto timer = DEBUG_TIMER(__func__);
187  IdMap id_map = initIdMap();
188 
189  if (id_map.empty()) {
190  return id_map;
191  }
192 
193  const StringOps_Namespace::StringOps string_ops(string_op_infos);
194 
195  // First map transient strings, store at front of vector map
196  const size_t num_transient_entries = id_map.numTransients();
197  size_t num_transient_strings_not_translated = 0UL;
198  if (num_transient_entries) {
199  std::vector<std::string> transient_lookup_strings(num_transient_entries);
200  if (string_ops.size()) {
202  transient_string_vec_.cend(),
203  transient_lookup_strings.rbegin(),
204  [&](std::string const* ptr) { return string_ops(*ptr); });
205  } else {
207  transient_string_vec_.cend(),
208  transient_lookup_strings.rbegin(),
209  [](std::string const* ptr) { return *ptr; });
210  }
211 
212  // This lookup may have a different snapshot of
213  // dest_proxy transients and dictionary than what happens under
214  // the below dest_proxy_read_lock. We may need an unlocked version of
215  // getTransientBulk to ensure consistency (I don't believe
216  // current behavior would cause crashes/races, verify this though)
217 
218  // Todo(mattp): Consider variant of getTransientBulkImp that can take
219  // a vector of pointer-to-strings so we don't have to materialize
220  // transient_string_vec_ into transient_lookup_strings.
221 
222  num_transient_strings_not_translated =
223  dest_proxy->getTransientBulkImpl(transient_lookup_strings, id_map.data(), false);
224  }
225 
226  // Now map strings in dictionary
227  // We place non-transient strings after the transient strings
228  // if they exist, otherwise at index 0
229  int32_t* translation_map_stored_entries_ptr = id_map.storageData();
230 
231  auto dest_transient_lookup_callback = [dest_proxy, translation_map_stored_entries_ptr](
232  const std::string_view& source_string,
233  const int32_t source_string_id) {
234  translation_map_stored_entries_ptr[source_string_id] =
235  dest_proxy->lookupTransientStringUnlocked(source_string);
236  return translation_map_stored_entries_ptr[source_string_id] ==
238  };
239 
240  const size_t num_dest_transients = dest_proxy->transientEntryCountUnlocked();
241  const size_t num_persisted_strings_not_translated =
242  generation_ > 0 ? string_dict_->buildDictionaryTranslationMap(
243  dest_proxy->string_dict_.get(),
244  translation_map_stored_entries_ptr,
245  generation_,
246  dest_proxy->generation_,
247  num_dest_transients > 0UL,
248  dest_transient_lookup_callback,
249  string_op_infos)
250  : 0UL;
251 
252  const size_t num_dest_entries = dest_proxy->entryCountUnlocked();
253  const size_t num_total_entries =
254  id_map.getVectorMap().size() - 1UL /* account for skipped entry -1 */;
255  CHECK_GT(num_total_entries, 0UL);
256  const size_t num_strings_not_translated =
257  num_transient_strings_not_translated + num_persisted_strings_not_translated;
258  CHECK_LE(num_strings_not_translated, num_total_entries);
259  id_map.setNumUntranslatedStrings(num_strings_not_translated);
260 
261  // Below is a conservative setting of range based on the size of the destination proxy,
262  // but probably not worth a scan over the data (or inline computation as we translate)
263  // to compute the actual ranges
264 
265  id_map.setRangeStart(
266  num_dest_transients > 0 ? -1 - static_cast<int32_t>(num_dest_transients) : 0);
267  id_map.setRangeEnd(dest_proxy->storageEntryCount());
268 
269  const size_t num_entries_translated = num_total_entries - num_strings_not_translated;
270  const float match_pct =
271  100.0 * static_cast<float>(num_entries_translated) / num_total_entries;
272  VLOG(1) << std::fixed << std::setprecision(2) << match_pct << "% ("
273  << num_entries_translated << " entries) from dictionary ("
274  << string_dict_->getDbId() << ", " << string_dict_->getDictId() << ") with "
275  << num_total_entries << " total entries ( " << num_transient_entries
276  << " literals)"
277  << " translated to dictionary (" << dest_proxy->string_dict_->getDbId() << ", "
278  << dest_proxy->string_dict_->getDictId() << ") with " << num_dest_entries
279  << " total entries (" << dest_proxy->transientEntryCountUnlocked()
280  << " literals).";
281 
282  return id_map;
283 }
284 
// Acquires the (deferred) source read lock and destination write lock in an
// order determined by the dictionary ids.
//
//   * Equal ids: only the destination write lock is taken; the source read
//     lock is left unlocked.
//   * Otherwise the lock belonging to the smaller dictionary id is taken
//     first, then the other one.
//
// Both lock objects are expected to be constructed with std::defer_lock.
void order_translation_locks(const int32_t source_dict_id,
                             const int32_t dest_dict_id,
                             std::shared_lock<std::shared_mutex>& source_proxy_read_lock,
                             std::unique_lock<std::shared_mutex>& dest_proxy_write_lock) {
  if (source_dict_id == dest_dict_id) {
    // proxies are same, only take one write lock
    dest_proxy_write_lock.lock();
    return;
  }
  const bool source_goes_first = source_dict_id < dest_dict_id;
  if (source_goes_first) {
    source_proxy_read_lock.lock();
  }
  dest_proxy_write_lock.lock();
  if (!source_goes_first) {
    source_proxy_read_lock.lock();
  }
}
300 
303  const StringDictionaryProxy* dest_proxy,
304  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const {
305  const auto source_dict_id = getDictId();
306  const auto dest_dict_id = dest_proxy->getDictId();
307 
308  std::shared_lock<std::shared_mutex> source_proxy_read_lock(rw_mutex_, std::defer_lock);
309  std::unique_lock<std::shared_mutex> dest_proxy_write_lock(dest_proxy->rw_mutex_,
310  std::defer_lock);
312  source_dict_id, dest_dict_id, source_proxy_read_lock, dest_proxy_write_lock);
313  return buildIntersectionTranslationMapToOtherProxyUnlocked(dest_proxy, string_op_infos);
314 }
315 
317  StringDictionaryProxy* dest_proxy,
318  const std::vector<StringOps_Namespace::StringOpInfo>& string_op_infos) const {
319  auto timer = DEBUG_TIMER(__func__);
320 
321  const auto source_dict_id = getDictId();
322  const auto dest_dict_id = dest_proxy->getDictId();
323  std::shared_lock<std::shared_mutex> source_proxy_read_lock(rw_mutex_, std::defer_lock);
324  std::unique_lock<std::shared_mutex> dest_proxy_write_lock(dest_proxy->rw_mutex_,
325  std::defer_lock);
327  source_dict_id, dest_dict_id, source_proxy_read_lock, dest_proxy_write_lock);
328 
329  auto id_map =
330  buildIntersectionTranslationMapToOtherProxyUnlocked(dest_proxy, string_op_infos);
331  if (id_map.empty()) {
332  return id_map;
333  }
334  const auto num_untranslated_strings = id_map.numUntranslatedStrings();
335  if (num_untranslated_strings > 0) {
336  const size_t total_post_translation_dest_transients =
337  num_untranslated_strings + dest_proxy->transientEntryCountUnlocked();
338  constexpr size_t max_allowed_transients =
339  static_cast<size_t>(std::numeric_limits<int32_t>::max() -
340  2); /* -2 accounts for INVALID_STR_ID and NULL value */
341  if (total_post_translation_dest_transients > max_allowed_transients) {
342  throw std::runtime_error("Union translation to dictionary" +
343  std::to_string(getDictId()) + " would result in " +
344  std::to_string(total_post_translation_dest_transients) +
345  " transient entries, which is more than limit of " +
346  std::to_string(max_allowed_transients) + " transients.");
347  }
348  const int32_t map_domain_start = id_map.domainStart();
349  const int32_t map_domain_end = id_map.domainEnd();
350 
351  const StringOps_Namespace::StringOps string_ops(string_op_infos);
352  const bool has_string_ops = string_ops.size();
353 
354  // First iterate over transient strings and add to dest map
355  // Todo (todd): Add call to fetch string_views (local) or strings (distributed)
356  // for all non-translated ids to avoid string-by-string fetch
357 
358  for (int32_t source_string_id = map_domain_start; source_string_id < -1;
359  ++source_string_id) {
360  if (id_map[source_string_id] == StringDictionary::INVALID_STR_ID) {
361  const auto source_string = getStringUnlocked(source_string_id);
362  const auto dest_string_id = dest_proxy->getOrAddTransientUnlocked(
363  has_string_ops ? string_ops(source_string) : source_string);
364  id_map[source_string_id] = dest_string_id;
365  }
366  }
367  // Now iterate over stored strings
368  for (int32_t source_string_id = 0; source_string_id < map_domain_end;
369  ++source_string_id) {
370  if (id_map[source_string_id] == StringDictionary::INVALID_STR_ID) {
371  const auto source_string = string_dict_->getString(source_string_id);
372  const auto dest_string_id = dest_proxy->getOrAddTransientUnlocked(
373  has_string_ops ? string_ops(source_string) : source_string);
374  id_map[source_string_id] = dest_string_id;
375  }
376  }
377  }
378  // We may have added transients to the destination proxy, use this to update
379  // our id map range (used downstream for ExpressionRange)
380 
381  const size_t num_dest_transients = dest_proxy->transientEntryCountUnlocked();
382  id_map.setRangeStart(
383  num_dest_transients > 0 ? -1 - static_cast<int32_t>(num_dest_transients) : 0);
384  return id_map;
385 }
386 
387 namespace {
388 
389 bool is_like(const std::string& str,
390  const std::string& pattern,
391  const bool icase,
392  const bool is_simple,
393  const char escape) {
394  return icase
395  ? (is_simple ? string_ilike_simple(
396  str.c_str(), str.size(), pattern.c_str(), pattern.size())
397  : string_ilike(str.c_str(),
398  str.size(),
399  pattern.c_str(),
400  pattern.size(),
401  escape))
402  : (is_simple ? string_like_simple(
403  str.c_str(), str.size(), pattern.c_str(), pattern.size())
404  : string_like(str.c_str(),
405  str.size(),
406  pattern.c_str(),
407  pattern.size(),
408  escape));
409 }
410 
411 } // namespace
412 
413 std::vector<int32_t> StringDictionaryProxy::getLike(const std::string& pattern,
414  const bool icase,
415  const bool is_simple,
416  const char escape) const {
417  CHECK_GE(generation_, 0);
418  auto result = string_dict_->getLike(pattern, icase, is_simple, escape, generation_);
419  for (unsigned index = 0; index < transient_string_vec_.size(); ++index) {
420  if (is_like(*transient_string_vec_[index], pattern, icase, is_simple, escape)) {
421  result.push_back(transientIndexToId(index));
422  }
423  }
424  return result;
425 }
426 
namespace {

// Evaluates `str <comp_operator> pattern` using std::string::compare.
// Supported operators: "<", "<=", "=", ">", ">=", "<>".
// Throws std::runtime_error for any other operator string.
bool do_compare(const std::string& str,
                const std::string& pattern,
                const std::string& comp_operator) {
  const int ordering = str.compare(pattern);
  if (comp_operator == "<") {
    return ordering < 0;
  }
  if (comp_operator == "<=") {
    return ordering <= 0;
  }
  if (comp_operator == "=") {
    return ordering == 0;
  }
  if (comp_operator == ">") {
    return ordering > 0;
  }
  if (comp_operator == ">=") {
    return ordering >= 0;
  }
  if (comp_operator == "<>") {
    return ordering != 0;
  }
  throw std::runtime_error("unsupported string compare operator");
}

}  // namespace
450 
452  const std::string& pattern,
453  const std::string& comp_operator) const {
454  CHECK_GE(generation_, 0);
455  auto result = string_dict_->getCompare(pattern, comp_operator, generation_);
456  for (unsigned index = 0; index < transient_string_vec_.size(); ++index) {
457  if (do_compare(*transient_string_vec_[index], pattern, comp_operator)) {
458  result.push_back(transientIndexToId(index));
459  }
460  }
461  return result;
462 }
463 
464 namespace {
465 
466 bool is_regexp_like(const std::string& str,
467  const std::string& pattern,
468  const char escape) {
469  return regexp_like(str.c_str(), str.size(), pattern.c_str(), pattern.size(), escape);
470 }
471 
472 } // namespace
473 
474 std::vector<int32_t> StringDictionaryProxy::getRegexpLike(const std::string& pattern,
475  const char escape) const {
476  CHECK_GE(generation_, 0);
477  auto result = string_dict_->getRegexpLike(pattern, escape, generation_);
478  for (unsigned index = 0; index < transient_string_vec_.size(); ++index) {
479  if (is_regexp_like(*transient_string_vec_[index], pattern, escape)) {
480  result.push_back(transientIndexToId(index));
481  }
482  }
483  return result;
484 }
485 
486 int32_t StringDictionaryProxy::getOrAdd(const std::string& str) noexcept {
487  return string_dict_->getOrAdd(str);
488 }
489 
490 std::pair<const char*, size_t> StringDictionaryProxy::getStringBytes(
491  int32_t string_id) const noexcept {
492  if (string_id >= 0) {
493  return string_dict_.get()->getStringBytes(string_id);
494  }
495  unsigned const string_index = transientIdToIndex(string_id);
496  CHECK_LT(string_index, transient_string_vec_.size());
497  std::string const* const str_ptr = transient_string_vec_[string_index];
498  return {str_ptr->c_str(), str_ptr->size()};
499 }
500 
502  const size_t num_storage_entries{generation_ == -1 ? string_dict_->storageEntryCount()
503  : generation_};
504  CHECK_LE(num_storage_entries, static_cast<size_t>(std::numeric_limits<int32_t>::max()));
505  return num_storage_entries;
506 }
507 
509  // CHECK_LE(num_storage_entries,
510  // static_cast<size_t>(std::numeric_limits<int32_t>::max()));
511  const size_t num_transient_entries{transient_str_to_int_.size()};
512  CHECK_LE(num_transient_entries,
513  static_cast<size_t>(std::numeric_limits<int32_t>::max()) - 1);
514  return num_transient_entries;
515 }
516 
518  std::shared_lock<std::shared_mutex> read_lock(rw_mutex_);
520 }
521 
524 }
525 
527  std::shared_lock<std::shared_mutex> read_lock(rw_mutex_);
528  return entryCountUnlocked();
529 }
530 
531 // Iterate over transient strings, then non-transients.
533  StringDictionary::StringCallback& serial_callback) const {
534  constexpr int32_t max_transient_id = -2;
535  // Iterate over transient strings.
536  for (unsigned index = 0; index < transient_string_vec_.size(); ++index) {
537  std::string const& str = *transient_string_vec_[index];
538  int32_t const string_id = max_transient_id - index;
539  serial_callback(str, string_id);
540  }
541  // Iterate over non-transient strings.
542  string_dict_->eachStringSerially(generation_, serial_callback);
543 }
544 
545 // For each (string/_view,old_id) pair passed in:
546 // * Get the new_id based on sdp_'s dictionary, or add it as a transient.
547 // * The StringDictionary is local, so call the faster getUnlocked() method.
548 // * Store the old_id -> new_id translation into the id_map_.
552 
553  public:
555  : sdp_(sdp), id_map_(id_map) {}
556  void operator()(std::string const& str, int32_t const string_id) override {
557  operator()(std::string_view(str), string_id);
558  }
559  void operator()(std::string_view const sv, int32_t const old_id) override {
560  int32_t const new_id = sdp_->string_dict_->getUnlocked(sv);
561  id_map_[old_id] = new_id == StringDictionary::INVALID_STR_ID
563  : new_id;
564  }
565 };
566 
567 // For each (string,old_id) pair passed in:
568 // * Get the new_id based on sdp_'s dictionary, or add it as a transient.
569 // * The StringDictionary is not local, so call string_dict_->makeLambdaStringToId()
570 // to make a lookup hash.
571 // * Store the old_id -> new_id translation into the id_map_.
575  using Lambda = std::function<int32_t(std::string const&)>;
577 
578  public:
580  : sdp_(sdp)
581  , id_map_(id_map)
582  , string_to_id_(sdp->string_dict_->makeLambdaStringToId()) {}
583  void operator()(std::string const& str, int32_t const old_id) override {
584  int32_t const new_id = string_to_id_(str);
585  id_map_[old_id] = new_id == StringDictionary::INVALID_STR_ID
587  : new_id;
588  }
589  void operator()(std::string_view const, int32_t const string_id) override {
590  UNREACHABLE() << "StringNetworkCallback requires a std::string.";
591  }
592 };
593 
594 // Union strings from both StringDictionaryProxies into *this as transients.
595 // Return id_map: sdp_rhs:string_id -> this:string_id for each string in sdp_rhs.
597  StringDictionaryProxy const& sdp_rhs) {
598  IdMap id_map = sdp_rhs.initIdMap();
599  // serial_callback cannot be parallelized due to calling getOrAddTransientUnlocked().
600  std::unique_ptr<StringDictionary::StringCallback> serial_callback;
601  if (string_dict_->isClient()) {
602  serial_callback = std::make_unique<StringNetworkCallback>(this, id_map);
603  } else {
604  serial_callback = std::make_unique<StringLocalCallback>(this, id_map);
605  }
606  // Import all non-duplicate strings (transient and non-transient) and add to id_map.
607  sdp_rhs.eachStringSerially(*serial_callback);
608  return id_map;
609 }
610 
611 std::ostream& operator<<(std::ostream& os, StringDictionaryProxy::IdMap const& id_map) {
612  return os << "IdMap(offset_(" << id_map.offset_ << ") vector_map_"
613  << shared::printContainer(id_map.vector_map_) << ')';
614 }
615 
616 void StringDictionaryProxy::updateGeneration(const int64_t generation) noexcept {
617  if (generation == -1) {
618  return;
619  }
620  if (generation_ != -1) {
621  CHECK_EQ(generation_, generation);
622  return;
623  }
624  generation_ = generation;
625 }
626 
628  const std::vector<std::string>& strings,
629  int32_t* string_ids,
630  const bool take_read_lock) const {
631  const size_t num_strings = strings.size();
632  if (num_strings == 0) {
633  return 0UL;
634  }
635  // StringDictionary::getBulk returns the number of strings not found
636  if (string_dict_->getBulk(strings, string_ids, generation_) == 0UL) {
637  return 0UL;
638  }
639 
640  // If here, dictionary could not find at least 1 target string,
641  // now look these up in the transient dictionary
642  // transientLookupBulk returns the number of strings not found
643  return transientLookupBulk(strings, string_ids, take_read_lock);
644 }
645 
646 template <typename String>
648  const std::vector<String>& lookup_strings,
649  int32_t* string_ids,
650  const bool take_read_lock) const {
651  const size_t num_strings = lookup_strings.size();
652  auto read_lock = take_read_lock ? std::shared_lock<std::shared_mutex>(rw_mutex_)
653  : std::shared_lock<std::shared_mutex>();
654 
655  if (num_strings == static_cast<size_t>(0) || transient_str_to_int_.empty()) {
656  return 0UL;
657  }
658  constexpr size_t tbb_parallel_threshold{20000};
659  if (num_strings < tbb_parallel_threshold) {
660  return transientLookupBulkUnlocked(lookup_strings, string_ids);
661  } else {
662  return transientLookupBulkParallelUnlocked(lookup_strings, string_ids);
663  }
664 }
665 
666 template <typename String>
668  const std::vector<String>& lookup_strings,
669  int32_t* string_ids) const {
670  const size_t num_strings = lookup_strings.size();
671  size_t num_strings_not_found = 0;
672  for (size_t string_idx = 0; string_idx < num_strings; ++string_idx) {
673  if (string_ids[string_idx] != StringDictionary::INVALID_STR_ID) {
674  continue;
675  }
676  // If we're here it means we need to look up this string as we don't
677  // have a valid id for it
678  string_ids[string_idx] = lookupTransientStringUnlocked(lookup_strings[string_idx]);
679  if (string_ids[string_idx] == StringDictionary::INVALID_STR_ID) {
680  num_strings_not_found++;
681  }
682  }
683  return num_strings_not_found;
684 }
685 
686 template <typename String>
688  const std::vector<String>& lookup_strings,
689  int32_t* string_ids) const {
690  const size_t num_lookup_strings = lookup_strings.size();
691  const size_t target_inputs_per_thread = 20000L;
692  ThreadInfo thread_info(
693  std::thread::hardware_concurrency(), num_lookup_strings, target_inputs_per_thread);
694  CHECK_GE(thread_info.num_threads, 1L);
695  CHECK_GE(thread_info.num_elems_per_thread, 1L);
696 
697  std::vector<size_t> num_strings_not_found_per_thread(thread_info.num_threads, 0UL);
698 
699  tbb::task_arena limited_arena(thread_info.num_threads);
700  limited_arena.execute([&] {
702  tbb::blocked_range<size_t>(
703  0, num_lookup_strings, thread_info.num_elems_per_thread /* tbb grain_size */),
704  [&](const tbb::blocked_range<size_t>& r) {
705  const size_t start_idx = r.begin();
706  const size_t end_idx = r.end();
707  size_t num_local_strings_not_found = 0;
708  for (size_t string_idx = start_idx; string_idx < end_idx; ++string_idx) {
709  if (string_ids[string_idx] != StringDictionary::INVALID_STR_ID) {
710  continue;
711  }
712  string_ids[string_idx] =
713  lookupTransientStringUnlocked(lookup_strings[string_idx]);
714  if (string_ids[string_idx] == StringDictionary::INVALID_STR_ID) {
715  num_local_strings_not_found++;
716  }
717  }
718  const size_t tbb_thread_idx = tbb::this_task_arena::current_thread_index();
719  num_strings_not_found_per_thread[tbb_thread_idx] = num_local_strings_not_found;
720  },
721  tbb::simple_partitioner());
722  });
723  size_t num_strings_not_found = 0;
724  for (int64_t thread_idx = 0; thread_idx < thread_info.num_threads; ++thread_idx) {
725  num_strings_not_found += num_strings_not_found_per_thread[thread_idx];
726  }
727  return num_strings_not_found;
728 }
729 
731  return string_dict_.get();
732 }
733 
734 int64_t StringDictionaryProxy::getGeneration() const noexcept {
735  return generation_;
736 }
737 
739  return string_dict_id_ == rhs.string_dict_id_ &&
741 }
742 
744  return !operator==(rhs);
745 }
void eachStringSerially(StringDictionary::StringCallback &) const
#define CHECK_EQ(x, y)
Definition: Logger.h:231
std::pair< const char *, size_t > getStringBytes(int32_t string_id) const noexcept
std::vector< int32_t > getLike(const std::string &pattern, const bool icase, const bool is_simple, const char escape) const
size_t transientEntryCountUnlocked() const
StringLocalCallback(StringDictionaryProxy *sdp, StringDictionaryProxy::IdMap &id_map)
int64_t num_elems_per_thread
Definition: ThreadInfo.h:23
StringDictionaryProxy::IdMap & id_map_
size_t entryCount() const
Returns the number of total string entries for this proxy, both stored in the underlying dictionary a...
int32_t getIdOfStringNoGeneration(const std::string &str) const
std::ostream & operator<<(std::ostream &os, const SessionInfo &session_info)
Definition: SessionInfo.cpp:57
std::vector< int32_t > vector_map_
std::function< int32_t(std::string const &)> Lambda
std::string getStringUnlocked(const int32_t string_id) const
size_t storageEntryCount() const
Returns the number of string entries in the underlying string dictionary, at this proxy&#39;s generation_...
#define UNREACHABLE()
Definition: Logger.h:267
StringDictionary * getDictionary() const noexcept
#define CHECK_GE(x, y)
Definition: Logger.h:236
size_t transientLookupBulkUnlocked(const std::vector< String > &lookup_strings, int32_t *string_ids) const
StringDictionaryProxy * sdp_
void operator()(std::string const &str, int32_t const string_id) override
size_t transientLookupBulk(const std::vector< String > &lookup_strings, int32_t *string_ids, const bool take_read_lock) const
std::string getString(int32_t string_id) const
void setNumUntranslatedStrings(const size_t num_untranslated_strings)
Constants for Builtin SQL Types supported by OmniSci.
IdMap buildIntersectionTranslationMapToOtherProxyUnlocked(const StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
size_t transientLookupBulkParallelUnlocked(const std::vector< String > &lookup_strings, int32_t *string_ids) const
#define CHECK_GT(x, y)
Definition: Logger.h:235
int32_t getIdOfStringFromClient(String const &) const
std::vector< int32_t > getTransientBulk(const std::vector< std::string > &strings) const
Executes read-only lookup of a vector of strings and returns a vector of their integer ids...
std::string to_string(char const *&&v)
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator) const
bool is_regexp_like(const std::string &str, const std::string &pattern, const char escape)
StringNetworkCallback(StringDictionaryProxy *sdp, StringDictionaryProxy::IdMap &id_map)
static constexpr int32_t INVALID_STR_ID
std::shared_ptr< StringDictionary > string_dict_
int64_t num_threads
Definition: ThreadInfo.h:22
IdMap transientUnion(StringDictionaryProxy const &)
std::vector< std::string const * > transient_string_vec_
void order_translation_locks(const int32_t source_db_id, const int32_t source_dict_id, const int32_t dest_db_id, const int32_t dest_dict_id, std::shared_lock< std::shared_mutex > &source_read_lock, std::shared_lock< std::shared_mutex > &dest_read_lock)
RUNTIME_EXPORT DEVICE bool string_like(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len, const char escape_char)
Definition: StringLike.cpp:246
void operator()(std::string const &str, int32_t const old_id) override
int32_t lookupTransientStringUnlocked(const String &lookup_string) const
void setRangeEnd(const int32_t range_end)
std::vector< std::string > getStrings(const std::vector< int32_t > &string_ids) const
size_t getTransientBulkImpl(const std::vector< std::string > &strings, int32_t *string_ids, const bool take_read_lock) const
RUNTIME_EXPORT DEVICE bool string_like_simple(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len)
Definition: StringLike.cpp:43
bool is_like(const std::string &str, const std::string &pattern, const bool icase, const bool is_simple, const char escape)
void operator()(std::string_view const sv, int32_t const old_id) override
static int32_t transientIndexToId(unsigned const index)
void updateGeneration(const int64_t generation) noexcept
size_t transientEntryCount() const
Returns the number of transient string entries for this proxy,.
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:297
Functions to support the LIKE and ILIKE operator in SQL. Only single-byte character set is supported ...
IdMap buildUnionTranslationMapToOtherProxy(StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_types) const
StringDictionaryProxy(StringDictionaryProxy const &)=delete
std::vector< int32_t > const & getVectorMap() const
#define CHECK_LT(x, y)
Definition: Logger.h:233
void operator()(std::string_view const, int32_t const string_id) override
bool do_compare(const std::string &str, const std::string &pattern, const std::string &comp_operator)
#define CHECK_LE(x, y)
Definition: Logger.h:234
StringDictionaryProxy * sdp_
int32_t getOrAddTransientUnlocked(String const &)
bool operator!=(StringDictionaryProxy const &) const
std::vector< int32_t > getRegexpLike(const std::string &pattern, const char escape) const
int32_t getOrAdd(const std::string &str) noexcept
RUNTIME_EXPORT DEVICE bool string_ilike_simple(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len)
Definition: StringLike.cpp:59
bool operator==(StringDictionaryProxy const &) const
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
int32_t getDictId() const noexcept
mapd_shared_lock< mapd_shared_mutex > read_lock
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)
IdMap buildIntersectionTranslationMapToOtherProxy(const StringDictionaryProxy *dest_proxy, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos) const
Builds a vectorized string_id translation map from this proxy to dest_proxy.
#define DEBUG_TIMER(name)
Definition: Logger.h:370
int32_t getOrAddTransient(const std::string &str)
void setRangeStart(const int32_t range_start)
mapd_unique_lock< mapd_shared_mutex > write_lock
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:108
RUNTIME_EXPORT DEVICE bool regexp_like(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len, const char escape_char)
Definition: Regexp.cpp:41
int32_t getIdOfString(const std::string &str) const
static unsigned transientIdToIndex(int32_t const id)
int64_t getGeneration() const noexcept
#define VLOG(n)
Definition: Logger.h:317
int32_t truncate_to_generation(const int32_t id, const size_t generation)
StringDictionaryProxy::IdMap & id_map_
RUNTIME_EXPORT DEVICE bool string_ilike(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len, const char escape_char)
Definition: StringLike.cpp:257