19 #include <tbb/parallel_for.h> 20 #include <boost/filesystem/operations.hpp> 21 #include <boost/filesystem/path.hpp> 22 #include <boost/sort/spreadsort/string_sort.hpp> 25 #include <string_view> 32 #include <sys/fcntl.h> 52 auto fd =
omnisci::open(path, O_RDWR | O_CREAT | (recover ? O_APPEND : O_TRUNC), 0644);
56 auto err = std::string(
"Dictionary path ") + std::string(path) +
57 std::string(
" does not exist.");
74 if (in == 0 || (in > (UINT32_MAX))) {
80 uint32_t
rk_hash(
const std::string_view& str) {
81 uint32_t str_hash = 1;
83 for (
size_t i = 0; i < str.size(); ++i) {
84 str_hash = str_hash * 997 + str[i];
98 const bool materializeHashes,
99 size_t initial_capacity)
101 , string_id_hash_table_(initial_capacity, INVALID_STR_ID)
102 , rk_hashes_(initial_capacity)
104 , materialize_hashes_(materializeHashes)
107 , offset_map_(nullptr)
108 , payload_map_(nullptr)
109 , offset_file_size_(0)
110 , payload_file_size_(0)
111 , payload_file_off_(0)
112 , strings_cache_(nullptr) {
113 if (!isTemp && folder.empty()) {
118 CHECK_EQ(
size_t(0), (initial_capacity & (initial_capacity - 1)));
120 boost::filesystem::path storage_path(folder);
121 offsets_path_ = (storage_path / boost::filesystem::path(
"DictOffsets")).
string();
122 const auto payload_path =
123 (storage_path / boost::filesystem::path(
"DictPayload")).
string();
129 bool storage_is_empty =
false;
131 storage_is_empty =
true;
147 const uint64_t str_count =
152 const uint64_t max_entries =
154 round_up_p2(std::max(initial_capacity, static_cast<size_t>(1))));
158 std::vector<uint32_t> new_rk_hashes(max_entries / 2);
163 if (str_count == 0) {
167 unsigned string_id = 0;
170 uint32_t thread_inits = 0;
171 const auto thread_count = std::thread::hardware_concurrency();
172 const uint32_t items_per_thread = std::max<uint32_t>(
173 2000, std::min<uint32_t>(200000, (str_count / thread_count) + 1));
174 std::vector<std::future<std::vector<std::pair<uint32_t, unsigned int>>>>
176 for (string_id = 0; string_id < str_count; string_id += items_per_thread) {
177 dictionary_futures.emplace_back(std::async(
178 std::launch::async, [string_id, str_count, items_per_thread,
this] {
179 std::vector<std::pair<uint32_t, unsigned int>> hashVec;
180 for (uint32_t curr_id = string_id;
181 curr_id < string_id + items_per_thread && curr_id < str_count;
184 if (recovered.canary) {
188 std::string temp(recovered.c_str_ptr, recovered.size);
189 hashVec.emplace_back(std::make_pair(
rk_hash(temp), temp.size()));
195 if (thread_inits % thread_count == 0) {
200 if (dictionary_futures.size() != 0) {
203 VLOG(1) <<
"Opened string dictionary " << folder <<
" # Strings: " <<
str_count_ 212 std::vector<std::future<std::vector<std::pair<uint32_t, unsigned int>>>>&
213 dictionary_futures) {
214 for (
auto& dictionary_future : dictionary_futures) {
215 dictionary_future.wait();
216 auto hashVec = dictionary_future.get();
217 for (
auto& hash : hashVec) {
227 dictionary_futures.clear();
239 if (storage_slots == 0) {
244 int64_t min_bound = 0;
245 int64_t max_bound = storage_slots - 1;
247 while (min_bound <= max_bound) {
248 guess = (max_bound + min_bound) / 2;
251 max_bound = guess - 1;
253 min_bound = guess + 1;
256 CHECK_GE(guess + (min_bound > guess ? 1 : 0), 0);
257 return guess + (min_bound > guess ? 1 : 0);
289 std::vector<int32_t> string_ids;
290 client_->get_or_add_bulk(string_ids, std::vector<std::string>{str});
291 CHECK_EQ(
size_t(1), string_ids.size());
292 return string_ids.front();
301 LOG(
ERROR) <<
"Could not encode string: " << str
302 <<
", the encoded value doesn't fit in " <<
sizeof(
T) * 8
303 <<
" bits. Will store NULL instead.";
308 template <
class String>
310 const std::vector<std::vector<String>>& string_array_vec,
311 std::vector<std::vector<int32_t>>& ids_array_vec) {
312 ids_array_vec.resize(string_array_vec.size());
313 for (
size_t i = 0; i < string_array_vec.size(); i++) {
314 auto& strings = string_array_vec[i];
315 auto& ids = ids_array_vec[i];
316 ids.resize(strings.size());
322 const std::vector<std::vector<std::string>>& string_array_vec,
323 std::vector<std::vector<int32_t>>& ids_array_vec);
330 template <
class String>
332 std::vector<uint32_t>& hashes)
const noexcept {
333 CHECK_EQ(string_vec.size(), hashes.size());
335 tbb::parallel_for(tbb::blocked_range<size_t>(0, string_vec.size()),
336 [&string_vec, &hashes](
const tbb::blocked_range<size_t>& r) {
337 for (
size_t curr_id = r.begin(); curr_id != r.end(); ++curr_id) {
338 if (string_vec[curr_id].empty()) {
341 hashes[curr_id] =
rk_hash(string_vec[curr_id]);
346 template <
class T,
class String>
348 T* output_string_ids) {
361 for (
const auto& str : input_strings) {
363 output_string_ids[out_idx++] = inline_int_null_value<T>();
368 const uint32_t hash =
rk_hash(str);
376 if (
str_count_ == static_cast<size_t>(max_valid_int_value<T>())) {
377 log_encoding_error<T>(str);
378 output_string_ids[out_idx++] = inline_int_null_value<T>();
384 <<
") of Dictionary encoded Strings reached for this column, offset path " 405 template <
class T,
class String>
407 T* output_string_ids) {
414 std::vector<uint32_t> input_strings_rk_hashes(input_strings.size());
415 hashStrings(input_strings, input_strings_rk_hashes);
418 size_t shadow_str_count =
420 const size_t storage_high_water_mark = shadow_str_count;
421 std::vector<size_t> string_memory_ids;
422 size_t sum_new_string_lengths = 0;
423 string_memory_ids.reserve(input_strings.size());
424 size_t input_string_idx{0};
425 for (
const auto& input_string : input_strings) {
427 if (input_string.empty()) {
428 output_string_ids[input_string_idx++] = inline_int_null_value<T>();
439 input_strings_rk_hashes);
442 const uint32_t input_string_rk_hash = input_strings_rk_hashes[input_string_idx];
447 storage_high_water_mark,
460 if (shadow_str_count == static_cast<size_t>(max_valid_int_value<T>())) {
461 log_encoding_error<T>(input_string);
462 output_string_ids[input_string_idx++] = inline_int_null_value<T>();
466 <<
"Maximum number (" << shadow_str_count
467 <<
") of Dictionary encoded Strings reached for this column, offset path " 471 string_memory_ids.push_back(input_string_idx);
472 sum_new_string_lengths += input_string.size();
475 rk_hashes_[shadow_str_count] = input_string_rk_hash;
477 output_string_ids[input_string_idx++] = shadow_str_count++;
485 uint8_t* encoded_vec);
487 uint16_t* encoded_vec);
489 int32_t* encoded_vec);
492 const std::vector<std::string_view>& string_vec,
493 uint8_t* encoded_vec);
495 const std::vector<std::string_view>& string_vec,
496 uint16_t* encoded_vec);
498 const std::vector<std::string_view>& string_vec,
499 int32_t* encoded_vec);
501 template <
class T,
class String>
505 std::vector<int32_t> string_ids;
508 for (
size_t i = 0; i < string_ids.size(); ++i) {
509 const auto string_id = string_ids[i];
510 const bool invalid = string_id > max_valid_int_value<T>();
511 if (invalid || string_id == inline_int_null_value<int32_t>()) {
513 log_encoding_error<T>(string_vec[i]);
515 encoded_vec[out_idx++] = inline_int_null_value<T>();
518 encoded_vec[out_idx++] = string_id;
523 const std::vector<std::string>& string_vec,
524 uint8_t* encoded_vec);
526 const std::vector<std::string>& string_vec,
527 uint16_t* encoded_vec);
529 const std::vector<std::string>& string_vec,
530 int32_t* encoded_vec);
533 const std::vector<std::string_view>& string_vec,
534 uint8_t* encoded_vec);
536 const std::vector<std::string_view>& string_vec,
537 uint16_t* encoded_vec);
539 const std::vector<std::string_view>& string_vec,
540 int32_t* encoded_vec);
551 const uint32_t hash =
rk_hash(str);
560 client_->get_string(ret, string_id);
583 return client_->storage_entry_count();
591 const std::string& pattern,
593 const bool is_simple,
597 str.c_str(), str.size(), pattern.c_str(), pattern.size())
604 str.c_str(), str.size(), pattern.c_str(), pattern.size())
616 const bool is_simple,
618 const size_t generation)
const {
621 return client_->get_like(pattern, icase, is_simple, escape, generation);
623 const auto cache_key = std::make_tuple(pattern, icase, is_simple, escape);
628 std::vector<int32_t>
result;
629 std::vector<std::thread> workers;
632 std::vector<std::vector<int32_t>> worker_results(worker_count);
634 for (
int worker_idx = 0; worker_idx < worker_count; ++worker_idx) {
635 workers.emplace_back([&worker_results,
644 for (
size_t string_id = worker_idx; string_id < generation;
645 string_id += worker_count) {
647 if (
is_like(str, pattern, icase, is_simple, escape)) {
648 worker_results[worker_idx].push_back(string_id);
653 for (
auto& worker : workers) {
656 for (
const auto& worker_result : worker_results) {
657 result.insert(result.end(), worker_result.begin(), worker_result.end());
660 const auto it_ok =
like_cache_.insert(std::make_pair(cache_key, result));
668 std::string comp_operator,
670 std::vector<int32_t>
result;
675 auto eq_id = eq_id_itr->second;
676 if (comp_operator ==
"=") {
677 result.push_back(eq_id);
679 for (int32_t idx = 0; idx <= cur_size; idx++) {
683 result.push_back(idx);
687 std::vector<std::thread> workers;
690 std::vector<std::vector<int32_t>> worker_results(worker_count);
692 for (
int worker_idx = 0; worker_idx < worker_count; ++worker_idx) {
693 workers.emplace_back(
694 [&worker_results, &pattern, generation, worker_idx, worker_count,
this]() {
695 for (
size_t string_id = worker_idx; string_id < generation;
696 string_id += worker_count) {
698 if (str == pattern) {
699 worker_results[worker_idx].push_back(string_id);
704 for (
auto& worker : workers) {
707 for (
const auto& worker_result : worker_results) {
708 result.insert(result.end(), worker_result.begin(), worker_result.end());
710 if (result.size() > 0) {
711 const auto it_ok =
equal_cache_.insert(std::make_pair(pattern, result[0]));
715 if (comp_operator ==
"<>") {
716 for (int32_t idx = 0; idx <= cur_size; idx++) {
720 result.push_back(idx);
728 const std::string& comp_operator,
729 const size_t generation) {
732 return client_->get_compare(pattern, comp_operator, generation);
734 std::vector<int32_t> ret;
739 if (comp_operator ==
"=" || comp_operator ==
"<>") {
740 return getEquals(pattern, comp_operator, generation);
748 cache_index = std::make_shared<StringDictionary::compare_cache_value_t>();
753 [
this](decltype(
sorted_cache)::value_type
const& a, decltype(pattern)& b) {
755 return string_lt(a_str.c_str_ptr, a_str.size, b.c_str(), b.size());
760 cache_index->diff = 1;
764 cache_str.c_str_ptr, cache_str.size, pattern.c_str(), pattern.size())) {
765 cache_index->index = cache_itr -
sorted_cache.begin() - 1;
766 cache_index->diff = 1;
769 cache_index->diff = 0;
790 if (comp_operator ==
"<") {
791 size_t idx = cache_index->index;
792 if (cache_index->diff) {
793 idx = cache_index->index + 1;
794 if (cache_index->index == 0 && cache_index->diff > 0) {
795 idx = cache_index->index;
798 for (
size_t i = 0; i < idx; i++) {
809 }
else if (comp_operator ==
"<=") {
810 size_t idx = cache_index->index + 1;
811 if (cache_index == 0 && cache_index->diff > 0) {
812 idx = cache_index->index;
814 for (
size_t i = 0; i < idx; i++) {
823 }
else if (comp_operator ==
">") {
824 size_t idx = cache_index->index + 1;
825 if (cache_index->index == 0 && cache_index->diff > 0) {
826 idx = cache_index->index;
838 }
else if (comp_operator ==
">=") {
839 size_t idx = cache_index->index;
840 if (cache_index->diff) {
841 idx = cache_index->index + 1;
842 if (cache_index->index == 0 && cache_index->diff > 0) {
843 idx = cache_index->index;
849 }
else if (comp_operator ==
"=") {
850 if (!cache_index->diff) {
856 }
else if (comp_operator ==
"<>") {
857 if (!cache_index->diff) {
858 size_t idx = cache_index->index;
859 for (
size_t i = 0; i < idx; i++) {
873 std::runtime_error(
"Unsupported string comparison operator");
881 const std::string& pattern,
883 return regexp_like(str.c_str(), str.size(), pattern.c_str(), pattern.size(), escape);
890 const size_t generation)
const {
893 return client_->get_regexp_like(pattern, escape, generation);
895 const auto cache_key = std::make_pair(pattern, escape);
900 std::vector<int32_t>
result;
901 std::vector<std::thread> workers;
904 std::vector<std::vector<int32_t>> worker_results(worker_count);
906 for (
int worker_idx = 0; worker_idx < worker_count; ++worker_idx) {
907 workers.emplace_back([&worker_results,
914 for (
size_t string_id = worker_idx; string_id < generation;
915 string_id += worker_count) {
918 worker_results[worker_idx].push_back(string_id);
923 for (
auto& worker : workers) {
926 for (
const auto& worker_result : worker_results) {
927 result.insert(result.end(), worker_result.begin(), worker_result.end());
929 const auto it_ok =
regex_cache_.insert(std::make_pair(cache_key, result));
939 throw std::runtime_error(
940 "copying dictionaries from remote server is not supported yet.");
949 const bool multithreaded =
str_count_ > 10000;
950 const auto worker_count =
951 multithreaded ?
static_cast<size_t>(
cpu_threads()) :
size_t(1);
953 std::vector<std::vector<std::string>> worker_results(worker_count);
954 auto copy = [
this](std::vector<std::string>& str_list,
955 const size_t start_id,
956 const size_t end_id) {
958 str_list.reserve(end_id - start_id);
959 for (
size_t string_id = start_id; string_id < end_id; ++string_id) {
964 std::vector<std::future<void>> workers;
965 const auto stride = (
str_count_ + (worker_count - 1)) / worker_count;
966 for (
size_t worker_idx = 0, start = 0,
end = std::min(start + stride,
str_count_);
967 worker_idx < worker_count && start <
str_count_;
968 ++worker_idx, start += stride,
end = std::min(start + stride, str_count_)) {
969 workers.push_back(std::async(
970 std::launch::async,
copy, std::ref(worker_results[worker_idx]), start, end));
972 for (
auto& worker : workers) {
976 CHECK_EQ(worker_results.size(), size_t(1));
980 for (
const auto& worker_result : worker_results) {
982 strings_cache_->end(), worker_result.begin(), worker_result.end());
999 new_str_ids[bucket] = string_id_hash_table_[i];
1006 const uint32_t hash =
rk_hash(str);
1008 new_str_ids[bucket] = i;
1014 template <
class String>
1016 const size_t storage_high_water_mark,
1017 const std::vector<String>& input_strings,
1018 const std::vector<size_t>& string_memory_ids,
1019 const std::vector<uint32_t>& input_strings_rk_hashes) noexcept {
1026 new_str_ids[bucket] = string_id_hash_table_[i];
1031 for (
size_t storage_idx = 0; storage_idx != storage_high_water_mark; ++storage_idx) {
1033 const uint32_t hash =
rk_hash(storage_string);
1035 new_str_ids[bucket] = storage_idx;
1037 for (
size_t memory_idx = 0; memory_idx != string_memory_ids.size(); ++memory_idx) {
1038 size_t string_memory_id = string_memory_ids[memory_idx];
1040 input_strings_rk_hashes[string_memory_id], new_str_ids);
1041 new_str_ids[bucket] = storage_high_water_mark + memory_idx;
1049 if (str.size() == 0) {
1050 return inline_int_null_value<int32_t>();
1054 const uint32_t hash =
rk_hash(str);
1069 <<
") of Dictionary encoded Strings reached for this column, offset path " 1090 CHECK(!str_canary.canary);
1091 return std::string(str_canary.c_str_ptr, str_canary.size);
1095 const int string_id)
const noexcept {
1097 CHECK(!str_canary.canary);
1098 return std::make_pair(str_canary.c_str_ptr, str_canary.size);
1101 template <
class String>
1104 const std::vector<int32_t>& data)
const 1106 auto bucket = hash & (data.size() - 1);
1108 const int32_t candidate_string_id = data[bucket];
1109 if (candidate_string_id ==
1116 if (str.size() == old_str.size() &&
1117 !memcmp(str.data(), old_str.data(), str.size())) {
1123 if (++bucket == data.size()) {
1130 template <
class String>
1132 const uint32_t input_string_rk_hash,
1133 const String& input_string,
1134 const std::vector<int32_t>& string_id_hash_table,
1135 const size_t storage_high_water_mark,
1136 const std::vector<String>& input_strings,
1137 const std::vector<size_t>& string_memory_ids)
const noexcept {
1138 auto bucket = input_string_rk_hash & (string_id_hash_table.size() - 1);
1140 const int32_t candidate_string_id = string_id_hash_table[bucket];
1141 if (candidate_string_id ==
1146 (input_string_rk_hash ==
rk_hashes_[candidate_string_id])) {
1147 if (candidate_string_id > 0 &&
1148 static_cast<size_t>(candidate_string_id) >= storage_high_water_mark) {
1151 size_t memory_offset =
1152 static_cast<size_t>(candidate_string_id - storage_high_water_mark);
1153 const String candidate_string = input_strings[string_memory_ids[memory_offset]];
1154 if (input_string.size() == candidate_string.size() &&
1155 !memcmp(input_string.data(), candidate_string.data(), input_string.size())) {
1161 const auto candidate_storage_string =
1163 if (input_string.size() == candidate_storage_string.size() &&
1164 !memcmp(input_string.data(),
1165 candidate_storage_string.data(),
1166 input_string.size())) {
1174 if (++bucket == string_id_hash_table.size()) {
1182 const uint32_t hash,
1183 const std::vector<int32_t>& data) noexcept {
1184 auto bucket = hash & (data.size() - 1);
1192 if (++bucket == data.size()) {
1200 const size_t write_length) {
1202 const size_t min_capacity_needed =
1219 const size_t write_length) {
1222 const size_t min_capacity_needed =
1238 template <
class String>
1252 template <
class String>
1254 const std::vector<String>& input_strings,
1255 const std::vector<size_t>& string_memory_ids,
1256 const size_t sum_new_strings_lengths) noexcept {
1257 const size_t num_strings = string_memory_ids.size();
1262 for (
size_t i = 0; i < num_strings; ++i) {
1263 const size_t string_idx = string_memory_ids[i];
1264 const String str = input_strings[string_idx];
1265 const size_t str_size(str.size());
1280 const int string_id)
const noexcept {
1287 if (str_meta->
size == 0xffff) {
1289 return {
nullptr, 0,
true};
1314 const size_t min_capacity_requested) noexcept {
1315 const size_t canary_buff_size_to_add =
1326 CHECK_NE(lseek(fd, 0, SEEK_END), -1);
1328 CHECK(write_return > 0 &&
1329 (static_cast<size_t>(write_return) == canary_buff_size_to_add));
1330 return canary_buff_size_to_add;
1335 const size_t min_capacity_requested) noexcept {
1336 const size_t canary_buff_size_to_add =
1341 reinterpret_cast<char*
>(realloc(
CANARY_BUFFER, canary_buff_size_to_add));
1346 void* new_addr = realloc(addr, mem_size + canary_buff_size_to_add);
1348 void* write_addr =
reinterpret_cast<void*
>(
static_cast<char*
>(new_addr) + mem_size);
1350 mem_size += canary_buff_size_to_add;
1389 std::vector<int32_t> temp_sorted_cache;
1390 for (
size_t i = cur_cache_size; i <
str_count_; i++) {
1391 temp_sorted_cache.push_back(i);
1403 std::sort(cache.begin(), cache.end(), [
this](int32_t a, int32_t b) {
1406 return string_lt(a_str.c_str_ptr, a_str.size, b_str.c_str_ptr, b_str.size);
1412 std::vector<int32_t> updated_cache(temp_sorted_cache.size() +
sorted_cache.size());
1413 size_t t_idx = 0, s_idx = 0, idx = 0;
1414 for (; t_idx < temp_sorted_cache.size() && s_idx <
sorted_cache.size(); idx++) {
1417 const auto insert_from_temp_cache =
1418 string_lt(t_string.c_str_ptr, t_string.size, s_string.c_str_ptr, s_string.size);
1419 if (insert_from_temp_cache) {
1420 updated_cache[idx] = temp_sorted_cache[t_idx++];
1425 while (t_idx < temp_sorted_cache.size()) {
1426 updated_cache[idx++] = temp_sorted_cache[t_idx++];
1435 std::vector<int32_t>& dest_ids,
1437 const std::vector<int32_t>& source_ids,
1439 const std::map<int32_t, std::string> transient_mapping) {
1440 std::vector<std::string> strings;
1442 for (
const int32_t source_id : source_ids) {
1443 if (source_id == std::numeric_limits<int32_t>::min()) {
1444 strings.emplace_back(
"");
1445 }
else if (source_id < 0) {
1446 if (
auto string_itr = transient_mapping.find(source_id);
1447 string_itr != transient_mapping.end()) {
1448 strings.emplace_back(string_itr->second);
1450 throw std::runtime_error(
"Unexpected negative source ID");
1453 strings.push_back(source_dict->
getString(source_id));
1457 dest_ids.resize(strings.size());
1462 std::vector<std::vector<int32_t>>& dest_array_ids,
1464 const std::vector<std::vector<int32_t>>& source_array_ids,
1466 dest_array_ids.resize(source_array_ids.size());
1468 std::atomic<size_t> row_idx{0};
1469 auto processor = [&row_idx, &dest_array_ids, dest_dict, &source_array_ids, source_dict](
1472 auto row = row_idx.fetch_add(1);
1474 if (row >= dest_array_ids.size()) {
1477 const auto& source_ids = source_array_ids[row];
1478 auto& dest_ids = dest_array_ids[row];
1483 const int num_worker_threads = std::thread::hardware_concurrency();
1485 if (source_array_ids.size() / num_worker_threads > 10) {
1486 std::vector<std::future<void>> worker_threads;
1487 for (
int i = 0; i < num_worker_threads; ++i) {
1488 worker_threads.push_back(std::async(std::launch::async, processor, i));
1491 for (
auto& child : worker_threads) {
1494 for (
auto& child : worker_threads) {
1505 const std::vector<int32_t>& source_ids,
1506 const DictRef source_dict_ref,
1507 const int32_t dest_generation) {
1508 DictRef temp_dict_ref(-1, -1);
1511 dest_ids, dest_dict_ref, source_ids, source_dict_ref, dest_generation);
StringIdxEntry * offset_map_
void translate_string_ids(std::vector< int32_t > &dest_ids, const DictRef dest_dict_ref, const std::vector< int32_t > &source_ids, const DictRef source_dict_ref, const int32_t dest_generation)
size_t payload_file_size_
int open(const char *path, int flags, int mode)
void checkAndConditionallyIncreasePayloadCapacity(const size_t write_length)
uint32_t rk_hash(const std::string_view &str)
void * checked_mmap(const int fd, const size_t sz)
int32_t getIdOfString(const std::string &str) const
size_t addStorageCapacity(int fd, const size_t min_capacity_requested=0) noexcept
DEVICE void sort(ARGS &&... args)
void hashStrings(const std::vector< String > &string_vec, std::vector< uint32_t > &hashes) const noexcept
std::vector< int32_t > getRegexpLike(const std::string &pattern, const char escape, const size_t generation) const
std::pair< char *, size_t > getStringBytesChecked(const int string_id) const noexcept
void addOffsetCapacity(const size_t min_capacity_requested=0) noexcept
std::string getStringChecked(const int string_id) const noexcept
size_t canary_buffer_size
std::string getString(int32_t string_id) const
DictionaryCache< std::string, compare_cache_value_t > compare_cache_
bool fillRateIsHigh(const size_t num_strings) const noexcept
void * addMemoryCapacity(void *addr, size_t &mem_size, const size_t min_capacity_requested=0) noexcept
DEVICE bool string_eq(const char *lhs, const int32_t lhs_len, const char *rhs, const int32_t rhs_len)
static void populate_string_ids(std::vector< int32_t > &dest_ids, StringDictionary *dest_dict, const std::vector< int32_t > &source_ids, const StringDictionary *source_dict, const std::map< int32_t, std::string > transient_mapping={})
Populates provided dest_ids vector with string ids corresponding to given source strings.
Constants for Builtin SQL Types supported by OmniSci.
std::string offsets_path_
std::string_view getStringFromStorageFast(const int string_id) const noexcept
int32_t getOrAddImpl(const std::string &str) noexcept
int32_t getOrAdd(const std::string &str) noexcept
int32_t getUnlocked(const std::string &str) const noexcept
std::map< std::pair< std::string, char >, std::vector< int32_t > > regex_cache_
size_t storageEntryCount() const
void increaseCapacity() noexcept
std::unique_ptr< StringDictionaryClient > client_
std::string getStringUnlocked(int32_t string_id) const noexcept
DEVICE auto copy(ARGS &&... args)
StringDictionary(const std::string &folder, const bool isTemp, const bool recover, const bool materializeHashes=false, size_t initial_capacity=256)
bool is_regexp_like(const std::string &str, const std::string &pattern, const char escape)
DEVICE auto lower_bound(ARGS &&... args)
static constexpr size_t MAX_STRCOUNT
std::vector< int32_t > getEquals(std::string pattern, std::string comp_operator, size_t generation)
DEVICE bool string_lt(const char *lhs, const int32_t lhs_len, const char *rhs, const int32_t rhs_len)
void log_encoding_error(std::string_view str)
static constexpr int32_t INVALID_STR_ID
std::shared_ptr< std::vector< std::string > > strings_cache_
DEVICE bool regexp_like(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len, const char escape_char)
DEVICE bool string_ilike(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len, const char escape_char)
std::vector< int32_t > getCompare(const std::string &pattern, const std::string &comp_operator, const size_t generation)
std::vector< int32_t > getLike(const std::string &pattern, const bool icase, const bool is_simple, const char escape, const size_t generation) const
mapd_shared_mutex rw_mutex_
DEVICE bool string_like(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len, const char escape_char)
uint32_t computeUniqueBucketWithHash(const uint32_t hash, const std::vector< int32_t > &data) noexcept
void appendToStorageBulk(const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids, const size_t sum_new_strings_lengths) noexcept
void addPayloadCapacity(const size_t min_capacity_requested=0) noexcept
bool is_like(const std::string &str, const std::string &pattern, const bool icase, const bool is_simple, const char escape)
std::map< std::string, int32_t > equal_cache_
void getOrAddBulkArray(const std::vector< std::vector< String >> &string_array_vec, std::vector< std::vector< int32_t >> &ids_array_vec)
void translate_string_ids(std::vector< int32_t > &dest_ids, const LeafHostInfo &dict_server_host, const DictRef dest_dict_ref, const std::vector< int32_t > &source_ids, const DictRef source_dict_ref, const int32_t dest_generation)
DEVICE bool string_ilike_simple(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len)
Functions to support the LIKE and ILIKE operator in SQL. Only single-byte character set is supported ...
int checked_open(const char *path, const bool recover)
void getOrAddBulk(const std::vector< String > &string_vec, T *encoded_vec)
std::pair< char *, size_t > getStringBytes(int32_t string_id) const noexcept
std::vector< int32_t > string_id_hash_table_
const int SYSTEM_PAGE_SIZE
bool checkpoint() noexcept
void mergeSortedCache(std::vector< int32_t > &temp_sorted_cache)
void increaseCapacityFromStorageAndMemory(const size_t storage_high_water_mark, const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids, const std::vector< uint32_t > &input_strings_rk_hashes) noexcept
size_t getNumStringsFromStorage(const size_t storage_slots) const noexcept
int msync(void *addr, size_t length, bool async)
void processDictionaryFutures(std::vector< std::future< std::vector< std::pair< uint32_t, unsigned int >>>> &dictionary_futures)
void checked_munmap(void *addr, size_t length)
std::shared_ptr< const std::vector< std::string > > copyStrings() const
std::unique_ptr< StringDictionaryClient > client_no_timeout_
mapd_shared_lock< mapd_shared_mutex > read_lock
void checkAndConditionallyIncreaseOffsetCapacity(const size_t write_length)
static void populate_string_array_ids(std::vector< std::vector< int32_t >> &dest_array_ids, StringDictionary *dest_dict, const std::vector< std::vector< int32_t >> &source_array_ids, const StringDictionary *source_dict)
void invalidateInvertedIndex() noexcept
size_t write(FILE *f, const size_t offset, const size_t size, int8_t *buf)
Writes the specified number of bytes to the offset position in file f from buf.
uint32_t computeBucket(const uint32_t hash, const String &str, const std::vector< int32_t > &data) const noexcept
const uint64_t round_up_p2(const uint64_t num)
DEVICE void swap(ARGS &&... args)
void sortCache(std::vector< int32_t > &cache)
static constexpr size_t MAX_STRLEN
void getOrAddBulkParallel(const std::vector< String > &string_vec, T *encoded_vec)
mapd_unique_lock< mapd_shared_mutex > write_lock
std::map< std::tuple< std::string, bool, bool, char >, std::vector< int32_t > > like_cache_
bool g_enable_stringdict_parallel
PayloadString getStringFromStorage(const int string_id) const noexcept
void getOrAddBulkRemote(const std::vector< String > &string_vec, T *encoded_vec)
DEVICE bool string_like_simple(const char *str, const int32_t str_len, const char *pattern, const int32_t pat_len)
uint32_t computeBucketFromStorageAndMemory(const uint32_t input_string_rk_hash, const String &input_string, const std::vector< int32_t > &string_id_hash_table, const size_t storage_high_water_mark, const std::vector< String > &input_strings, const std::vector< size_t > &string_memory_ids) const noexcept
std::vector< int32_t > sorted_cache
void appendToStorage(String str) noexcept
~StringDictionary() noexcept
std::vector< uint32_t > rk_hashes_
size_t file_size(const int fd)