std::unique_ptr<HashingSchemeRecycler> BaselineJoinHashTable::hash_table_layout_cache_ =
    std::make_unique<HashingSchemeRecycler>();

//! Make hash table from an in-flight SQL query's parse tree etc.
std::shared_ptr<BaselineJoinHashTable> BaselineJoinHashTable::getInstance(
    const std::shared_ptr<Analyzer::BinOper> condition,
    const std::vector<InputTableInfo>& query_infos,
    const Data_Namespace::MemoryLevel memory_level,
    const JoinType join_type,
    const HashType preferred_hash_type,
    const int device_count,
    ColumnCacheMap& column_cache,
    Executor* executor,
    const HashTableBuildDagMap& hashtable_build_dag_map,
    const RegisteredQueryHint& query_hints,
    const TableIdToNodeMap& table_id_to_node_map) {
  decltype(std::chrono::steady_clock::now()) ts1, ts2;
  VLOG(1) << "Building keyed hash table " << getHashTypeString(preferred_hash_type)
          << " for qual: " << condition->toString();
  ts1 = std::chrono::steady_clock::now();
  auto inner_outer_pairs =
      HashJoin::normalizeColumnPairs(condition.get(), executor->getTemporaryTables());
  const auto& inner_outer_cols = inner_outer_pairs.first;
  const auto& col_pairs_string_op_infos = inner_outer_pairs.second;
  auto join_hash_table = std::shared_ptr<BaselineJoinHashTable>(
      new BaselineJoinHashTable(condition,
                                join_type,
                                query_infos,
                                memory_level,
                                column_cache,
                                executor,
                                inner_outer_cols,
                                col_pairs_string_op_infos,
                                device_count,
                                query_hints,
                                hashtable_build_dag_map,
                                table_id_to_node_map));
  try {
    join_hash_table->reify(preferred_hash_type);
  } catch (const TableMustBeReplicated& e) {
    // Throw a runtime error to abort the query.
    join_hash_table->freeHashBufferMemory();
    throw std::runtime_error(e.what());
  } catch (const HashJoinFail& e) {
    // HashJoinFail exceptions log an error and trigger a retry with a join
    // loop (if possible).
    join_hash_table->freeHashBufferMemory();
    throw HashJoinFail(std::string("Could not build a 1-to-1 correspondence for columns "
                                   "involved in equijoin | ") +
                       e.what());
  } catch (const ColumnarConversionNotSupported& e) {
    throw HashJoinFail(std::string("Could not build hash tables for equijoin | ") +
                       e.what());
  } catch (const OutOfMemory& e) {
    throw HashJoinFail(
        std::string("Ran out of memory while building hash tables for equijoin | ") +
        e.what());
  } catch (const std::exception& e) {
    throw std::runtime_error(
        std::string("Fatal error while attempting to build hash tables for join: ") +
        std::string(e.what()));
  }
  ts2 = std::chrono::steady_clock::now();
  VLOG(1) << "Built keyed hash table "
          << getHashTypeString(join_hash_table->getHashType()) << " in "
          << std::chrono::duration_cast<std::chrono::milliseconds>(ts2 - ts1).count()
          << " ms";
  return join_hash_table;
}
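// The try/catch ladder above sorts build failures into two contracts:
// HashJoinFail is retryable (the executor can fall back to a loop join),
// while anything else aborts the query. Below is a minimal, self-contained
// sketch of that exception-translation pattern; HashJoinFail/OutOfMemory here
// are hypothetical stand-ins, not this file's actual types.
//
// #include <iostream>
// #include <stdexcept>
// #include <string>
//
// struct HashJoinFail : std::runtime_error {  // stand-in, assumed type
//   using std::runtime_error::runtime_error;
// };
// struct OutOfMemory : std::runtime_error {  // stand-in, assumed type
//   using std::runtime_error::runtime_error;
// };
//
// void build_hash_table() {
//   throw OutOfMemory("allocation failed");  // simulate a failure during the build
// }
//
// int main() {
//   try {
//     try {
//       build_hash_table();
//     } catch (const OutOfMemory& e) {
//       // Translate into the retryable category, preserving the original message.
//       throw HashJoinFail(std::string("Ran out of memory while building | ") +
//                          e.what());
//     }
//   } catch (const HashJoinFail& e) {
//     std::cout << "retryable: " << e.what() << '\n';  // caller may try a loop join
//   }
//   return 0;
// }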
BaselineJoinHashTable::BaselineJoinHashTable(
    const std::shared_ptr<Analyzer::BinOper> condition,
    const JoinType join_type,
    const std::vector<InputTableInfo>& query_infos,
    const Data_Namespace::MemoryLevel memory_level,
    ColumnCacheMap& column_cache,
    Executor* executor,
    const std::vector<InnerOuter>& inner_outer_pairs,
    const std::vector<InnerOuterStringOpInfos>& col_pairs_string_op_infos,
    const int device_count,
    const RegisteredQueryHint& query_hints,
    const HashTableBuildDagMap& hashtable_build_dag_map,
    const TableIdToNodeMap& table_id_to_node_map)
    : condition_(condition)
    , join_type_(join_type)
    , query_infos_(query_infos)
    , memory_level_(memory_level)
    , executor_(executor)
    , column_cache_(column_cache)
    , inner_outer_pairs_(inner_outer_pairs)
    , inner_outer_string_op_infos_pairs_(col_pairs_string_op_infos)
    , device_count_(device_count)
    , query_hints_(query_hints)
    , needs_dict_translation_(false)
    , hashtable_build_dag_map_(hashtable_build_dag_map)
    , table_id_to_node_map_(table_id_to_node_map) {
  hash_tables_for_device_.resize(std::max(device_count_, 1));
}
size_t BaselineJoinHashTable::getShardCountForCondition(
    const Analyzer::BinOper* condition,
    const Executor* executor,
    const std::vector<InnerOuter>& inner_outer_pairs) {
  for (const auto& inner_outer_pair : inner_outer_pairs) {
    const auto pair_shard_count = get_shard_count(inner_outer_pair, executor);
    if (pair_shard_count) {
      return pair_shard_count;
    }
  }
  return 0;
}
std::string BaselineJoinHashTable::toString(const ExecutorDeviceType device_type,
                                            const int device_id,
                                            bool raw) const {
  auto buffer = getJoinHashBuffer(device_type, device_id);
  if (!buffer) {
    return "EMPTY";
  }
  auto hash_table = getHashTableForDevice(device_id);
  CHECK(hash_table);
  auto buffer_size = hash_table->getHashTableBufferSize(device_type);
  std::unique_ptr<int8_t[]> buffer_copy;
  if (device_type == ExecutorDeviceType::GPU) {
    buffer_copy = std::make_unique<int8_t[]>(buffer_size);
    auto data_mgr = executor_->getDataMgr();
    auto device_allocator = std::make_unique<CudaAllocator>(
        data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
    device_allocator->copyFromDevice(buffer_copy.get(), buffer, buffer_size);
  }
  auto ptr1 = buffer_copy ? buffer_copy.get() : buffer;
  auto ptr2 = ptr1 + offsetBufferOff();
  auto ptr3 = ptr1 + countBufferOff();
  auto ptr4 = ptr1 + payloadBufferOff();
  const auto layout = hash_table->getLayout();
  return HashTable::toString(
      "keyed",
      getHashTypeString(layout),
      getKeyComponentCount() + (layout == HashType::OneToOne ? 1 : 0),
      getKeyComponentWidth(),
      hash_table->getEntryCount(),
      ptr1,
      ptr2,
      ptr3,
      ptr4,
      buffer_size,
      raw);
}

std::set<DecodedJoinHashBufferEntry> BaselineJoinHashTable::toSet(
    const ExecutorDeviceType device_type,
    const int device_id) const {
  auto buffer = getJoinHashBuffer(device_type, device_id);
  auto hash_table = getHashTableForDevice(device_id);
  CHECK(hash_table);
  auto buffer_size = hash_table->getHashTableBufferSize(device_type);
  std::unique_ptr<int8_t[]> buffer_copy;
  if (device_type == ExecutorDeviceType::GPU) {
    buffer_copy = std::make_unique<int8_t[]>(buffer_size);
    auto data_mgr = executor_->getDataMgr();
    auto device_allocator = std::make_unique<CudaAllocator>(
        data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
    device_allocator->copyFromDevice(buffer_copy.get(), buffer, buffer_size);
  }
  auto ptr1 = buffer_copy ? buffer_copy.get() : buffer;
  auto ptr2 = ptr1 + offsetBufferOff();
  auto ptr3 = ptr1 + countBufferOff();
  auto ptr4 = ptr1 + payloadBufferOff();
  const auto layout = hash_table->getLayout();
  return HashTable::toSet(
      getKeyComponentCount() + (layout == HashType::OneToOne ? 1 : 0),
      getKeyComponentWidth(),
      hash_table->getEntryCount(),
      ptr1,
      ptr2,
      ptr3,
      ptr4,
      buffer_size);
}
bool needs_dictionary_translation(
    const std::vector<InnerOuter>& inner_outer_pairs,
    const std::vector<InnerOuterStringOpInfos>& inner_outer_string_op_infos_pairs,
    const Executor* executor) {
  const auto num_col_pairs = inner_outer_pairs.size();
  CHECK_EQ(num_col_pairs, inner_outer_string_op_infos_pairs.size());
  for (size_t col_pair_idx = 0; col_pair_idx < num_col_pairs; ++col_pair_idx) {
    if (needs_dictionary_translation(inner_outer_pairs[col_pair_idx],
                                     inner_outer_string_op_infos_pairs[col_pair_idx],
                                     executor)) {
      return true;
    }
  }
  return false;
}
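// Dictionary-encoded string columns built against different dictionaries can
// assign different integer IDs to the same string, which is why the check
// above matters: inner-side IDs must be remapped into the outer side's ID
// space before they can be compared as raw integers. A minimal, standalone
// sketch of that remapping (all names hypothetical):
//
// #include <iostream>
// #include <string>
// #include <unordered_map>
// #include <vector>
//
// int main() {
//   // Inner and outer dictionaries disagree on IDs for the same strings.
//   std::vector<std::string> inner_dict = {"red", "green", "blue"};  // id == index
//   std::unordered_map<std::string, int> outer_dict = {
//       {"blue", 0}, {"red", 1}, {"green", 2}};
//   // Build an id -> id translation map (the IdMap role).
//   std::vector<int> id_map(inner_dict.size());
//   for (size_t inner_id = 0; inner_id < inner_dict.size(); ++inner_id) {
//     id_map[inner_id] = outer_dict.at(inner_dict[inner_id]);
//   }
//   std::vector<int> inner_ids = {2, 0, 1};  // blue, red, green
//   for (int id : inner_ids) {
//     std::cout << id << " -> " << id_map[id] << '\n';
//   }
// }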
void BaselineJoinHashTable::reify(const HashType preferred_layout) {
  auto timer = DEBUG_TIMER(__func__);
  CHECK_LT(0, device_count_);
  auto layout = preferred_layout;
  if (condition_->is_overlaps_oper()) {
    try {
      reifyWithLayout(HashType::OneToMany);
      return;
    } catch (const std::exception& e) {
      VLOG(1) << "Caught exception while building overlaps baseline hash table: "
              << e.what();
      throw;
    }
  }
  // If any key pair has string ops on either side, fall back to a one-to-many
  // layout.
  for (const auto& inner_outer_string_op_infos : inner_outer_string_op_infos_pairs_) {
    if (inner_outer_string_op_infos.first.size() ||
        inner_outer_string_op_infos.second.size()) {
      layout = HashType::OneToMany;
      break;
    }
  }
  try {
    reifyWithLayout(layout);
  } catch (const std::exception& e) {
    VLOG(1) << "Caught exception while building baseline hash table: " << e.what();
    // A one-to-one build can fail on duplicate keys; retry as one-to-many.
    if (layout == HashType::OneToMany) {
      throw;
    }
    reifyWithLayout(HashType::OneToMany);
  }
}
void BaselineJoinHashTable::reifyWithLayout(const HashType layout) {
  const auto& query_info =
      get_inner_query_info(getInnerTableId(), query_infos_).info;
  if (query_info.fragments.empty()) {
    return;
  }

  const auto total_entries = 2 * query_info.getNumTuplesUpperBound();
  if (total_entries > static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
    throw TooManyHashEntries();
  }

  auto data_mgr = executor_->getDataMgr();
  std::vector<std::unique_ptr<CudaAllocator>> dev_buff_owners;
  std::vector<std::vector<Fragmenter_Namespace::FragmentInfo>> fragments_per_device;
  std::vector<ColumnsForDevice> columns_per_device;
  const auto shard_count = shardCount();
  auto entries_per_device =
      get_entries_per_device(total_entries, shard_count, device_count_, memory_level_);

  std::vector<ChunkKey> chunk_key_per_device;
  for (int device_id = 0; device_id < device_count_; ++device_id) {
    fragments_per_device.emplace_back(
        shard_count
            ? only_shards_for_device(query_info.fragments, device_id, device_count_)
            : query_info.fragments);
    if (memory_level_ == Data_Namespace::GPU_LEVEL) {
      dev_buff_owners.emplace_back(std::make_unique<CudaAllocator>(
          data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id)));
    }
    const auto chunk_key = genChunkKey(fragments_per_device[device_id]);
    chunk_key_per_device.emplace_back(std::move(chunk_key));
  }

  // Prepare the cache keys used to recycle previously built hash tables.
  auto inner_outer_pairs =
      HashJoin::normalizeColumnPairs(condition_.get(), executor_->getTemporaryTables());
  const auto& inner_outer_cols = inner_outer_pairs.first;
  const auto& col_pairs_string_op_infos = inner_outer_pairs.second;
  auto hashtable_access_path_info =
      HashtableRecycler::getHashtableAccessPathInfo(inner_outer_cols,
                                                    col_pairs_string_op_infos,
                                                    condition_->get_optype(),
                                                    join_type_,
                                                    hashtable_build_dag_map_,
                                                    device_count_,
                                                    shard_count,
                                                    fragments_per_device,
                                                    executor_);
  hashtable_cache_key_ = hashtable_access_path_info.hashed_query_plan_dag;
  hashtable_cache_meta_info_ = hashtable_access_path_info.meta_info;
  table_keys_ = hashtable_access_path_info.table_keys;

  if (table_keys_.empty()) {
    // The DAG extractor could not resolve table keys; derive them from the
    // chunk keys instead.
    table_keys_ = DataRecyclerUtil::getAlternativeTableKeys(chunk_key_per_device,
                                                            getInnerTableId());
  }
  CHECK(!table_keys_.empty());

  if (HashtableRecycler::isInvalidHashTableCacheKey(hashtable_cache_key_) &&
      getInnerTableId().table_id > 0) {
    // The query plan DAG is unavailable; fall back to a cache key computed
    // from the join inputs themselves.
    for (int device_id = 0; device_id < device_count_; ++device_id) {
      const auto num_tuples = std::accumulate(fragments_per_device[device_id].begin(),
                                              fragments_per_device[device_id].end(),
                                              size_t(0),
                                              [](const auto& sum, const auto& fragment) {
                                                return sum + fragment.getNumTuples();
                                              });
      AlternativeCacheKeyForBaselineHashJoin cache_key{inner_outer_pairs_,
                                                       inner_outer_string_op_infos_pairs_,
                                                       num_tuples,
                                                       condition_->get_optype(),
                                                       join_type_,
                                                       chunk_key_per_device[device_id]};
      hashtable_cache_key_[device_id] = getAlternativeCacheKey(cache_key);
    }
  }

  const auto invalid_cache_key =
      HashtableRecycler::isInvalidHashTableCacheKey(hashtable_cache_key_);
  if (!invalid_cache_key) {
    if (!shard_count) {
      hash_table_cache_->addQueryPlanDagForTableKeys(hashtable_cache_key_.front(),
                                                     table_keys_);
    } else {
      std::for_each(hashtable_cache_key_.cbegin(),
                    hashtable_cache_key_.cend(),
                    [this](QueryPlanHash key) {
                      hash_table_cache_->addQueryPlanDagForTableKeys(key, table_keys_);
                    });
    }
  }
  // ...

  auto allow_hashtable_recycling =
      HashtableRecycler::isSafeToCacheHashtable(table_id_to_node_map_,
                                                needs_dict_translation_,
                                                inner_outer_string_op_infos_pairs_,
                                                getInnerTableId());
  bool has_invalid_cached_hash_table = false;
  if (memory_level_ == Data_Namespace::GPU_LEVEL &&
      HashJoin::canAccessHashTable(
          allow_hashtable_recycling, invalid_cache_key, join_type_)) {
    // Probe the CPU cache for every device's table.
    for (int device_id = 0; device_id < device_count_; ++device_id) {
      hash_tables_for_device_[device_id] =
          initHashTableOnCpuFromCache(hashtable_cache_key_[device_id],
                                      CacheItemType::BASELINE_HT,
                                      DataRecyclerUtil::CPU_DEVICE_IDENTIFIER);
      if (!hash_tables_for_device_[device_id]) {
        has_invalid_cached_hash_table = true;
        break;
      }
    }
    if (has_invalid_cached_hash_table) {
      // At least one device missed; drop the partial results and rebuild.
      for (int device_id = 0; device_id < device_count_; ++device_id) {
        hash_tables_for_device_[device_id].reset();
      }
    } else {
      // Every device hit the cache; copy the tables to their GPUs and return.
      for (int device_id = 0; device_id < device_count_; ++device_id) {
        auto cpu_hash_table = std::dynamic_pointer_cast<BaselineHashTable>(
            hash_tables_for_device_[device_id]);
        copyCpuHashTableToGpu(cpu_hash_table, device_id, data_mgr);
      }
      return;
    }
  }

  for (int device_id = 0; device_id < device_count_; ++device_id) {
    const auto columns_for_device =
        fetchColumnsForDevice(fragments_per_device[device_id],
                              device_id,
                              memory_level_ == Data_Namespace::GPU_LEVEL
                                  ? dev_buff_owners[device_id].get()
                                  : nullptr);
    columns_per_device.push_back(columns_for_device);
  }

  auto hashtable_layout_type = layout;
  size_t emitted_keys_count = 0;
  if (hashtable_layout_type == HashType::OneToMany) {
    CHECK(!columns_per_device.front().join_columns.empty());
    emitted_keys_count = columns_per_device.front().join_columns.front().num_elems;
  }
  size_t tuple_count;
  std::tie(tuple_count, std::ignore) = approximateTupleCount(columns_per_device);
  // Oversize the table by 2x to keep the load factor low.
  const auto entry_count = 2 * std::max(tuple_count, size_t(1));

  std::vector<std::future<void>> init_threads;
  for (int device_id = 0; device_id < device_count_; ++device_id) {
    const auto fragments =
        shard_count
            ? only_shards_for_device(query_info.fragments, device_id, device_count_)
            : query_info.fragments;
    init_threads.push_back(std::async(std::launch::async,
                                      &BaselineJoinHashTable::reifyForDevice,
                                      this,
                                      columns_per_device[device_id],
                                      hashtable_layout_type,
                                      device_id,
                                      entry_count,
                                      emitted_keys_count,
                                      logger::thread_local_ids()));
  }
  for (auto& init_thread : init_threads) {
    init_thread.wait();
  }
  for (auto& init_thread : init_threads) {
    init_thread.get();
  }
}
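// The build fans out one std::async task per device, waits on all of them,
// and only then calls get() so every task finishes before the first stored
// exception is rethrown. A standalone sketch of that pattern (names are
// illustrative, not this file's API):
//
// #include <future>
// #include <iostream>
// #include <vector>
//
// void build_for_device(int device_id) {
//   // ... per-device hash table build work ...
//   std::cout << "built table for device " << device_id << '\n';
// }
//
// int main() {
//   const int device_count = 4;
//   std::vector<std::future<void>> init_threads;
//   for (int device_id = 0; device_id < device_count; ++device_id) {
//     init_threads.push_back(
//         std::async(std::launch::async, build_for_device, device_id));
//   }
//   for (auto& t : init_threads) {
//     t.wait();  // let every task finish before propagating errors
//   }
//   for (auto& t : init_threads) {
//     t.get();  // rethrows the first stored exception, if any
//   }
// }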
std::pair<size_t, size_t> BaselineJoinHashTable::approximateTupleCount(
    const std::vector<ColumnsForDevice>& columns_per_device) const {
  const auto effective_memory_level = getEffectiveMemoryLevel(inner_outer_pairs_);
  CountDistinctDescriptor count_distinct_desc{
      CountDistinctImplType::Bitmap,
      0,
      11,
      true,
      effective_memory_level == Data_Namespace::MemoryLevel::GPU_LEVEL
          ? ExecutorDeviceType::GPU
          : ExecutorDeviceType::CPU,
      1};
  const auto padded_size_bytes = count_distinct_desc.bitmapPaddedSizeBytes();

  CHECK(!columns_per_device.empty() && !columns_per_device.front().join_columns.empty());

  if (effective_memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
    const int thread_count = cpu_threads();
    std::vector<uint8_t> hll_buffer_all_cpus(thread_count * padded_size_bytes);
    auto hll_result = &hll_buffer_all_cpus[0];
    approximate_distinct_tuples(hll_result,
                                count_distinct_desc.bitmap_sz_bits,
                                padded_size_bytes,
                                columns_per_device.front().join_columns,
                                columns_per_device.front().join_column_types,
                                thread_count);
    for (int i = 1; i < thread_count; ++i) {
      hll_unify(hll_result,
                hll_result + i * padded_size_bytes,
                1 << count_distinct_desc.bitmap_sz_bits);
    }
    return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
  }
  auto data_mgr = executor_->getDataMgr();
  std::vector<std::vector<uint8_t>> host_hll_buffers(device_count_);
  for (auto& host_hll_buffer : host_hll_buffers) {
    host_hll_buffer.resize(count_distinct_desc.bitmapPaddedSizeBytes());
  }
  std::vector<std::future<void>> approximate_distinct_device_threads;
  for (int device_id = 0; device_id < device_count_; ++device_id) {
    approximate_distinct_device_threads.emplace_back(std::async(
        std::launch::async,
        [device_id,
         &columns_per_device,
         &count_distinct_desc,
         data_mgr,
         &host_hll_buffers] {
          auto allocator = std::make_unique<CudaAllocator>(
              data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
          auto device_hll_buffer =
              allocator->alloc(count_distinct_desc.bitmapPaddedSizeBytes());
          data_mgr->getCudaMgr()->zeroDeviceMem(
              device_hll_buffer,
              count_distinct_desc.bitmapPaddedSizeBytes(),
              device_id,
              getQueryEngineCudaStreamForDevice(device_id));
          const auto& columns_for_device = columns_per_device[device_id];
          auto join_columns_gpu = transfer_vector_of_flat_objects_to_gpu(
              columns_for_device.join_columns, *allocator);
          auto join_column_types_gpu = transfer_vector_of_flat_objects_to_gpu(
              columns_for_device.join_column_types, *allocator);
          const auto key_handler =
              GenericKeyHandler(columns_for_device.join_columns.size(),
                                true,
                                join_columns_gpu,
                                join_column_types_gpu,
                                nullptr,
                                nullptr);
          const auto key_handler_gpu =
              transfer_flat_object_to_gpu(key_handler, *allocator);
          approximate_distinct_tuples_on_device(
              reinterpret_cast<uint8_t*>(device_hll_buffer),
              count_distinct_desc.bitmap_sz_bits,
              key_handler_gpu,
              columns_for_device.join_columns[0].num_elems);

          auto& host_hll_buffer = host_hll_buffers[device_id];
          allocator->copyFromDevice(&host_hll_buffer[0],
                                    device_hll_buffer,
                                    count_distinct_desc.bitmapPaddedSizeBytes());
        }));
  }
  for (auto& child : approximate_distinct_device_threads) {
    child.get();
  }
  CHECK_EQ(Data_Namespace::MemoryLevel::GPU_LEVEL, effective_memory_level);
  auto& result_hll_buffer = host_hll_buffers.front();
  auto hll_result = reinterpret_cast<int32_t*>(&result_hll_buffer[0]);
  for (int device_id = 1; device_id < device_count_; ++device_id) {
    auto& host_hll_buffer = host_hll_buffers[device_id];
    hll_unify(hll_result,
              reinterpret_cast<int32_t*>(&host_hll_buffer[0]),
              1 << count_distinct_desc.bitmap_sz_bits);
  }
  return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
}
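// hll_unify-style merging works because each HyperLogLog register stores the
// maximum rank observed; the union of two sketches is simply the element-wise
// max of their register arrays, so per-thread or per-device buffers can be
// reduced pairwise before the final hll_size estimate. A tiny standalone
// illustration of that reduction (toy register values, not real HLL state):
//
// #include <algorithm>
// #include <cstdint>
// #include <iostream>
// #include <vector>
//
// void unify(int8_t* lhs, const int8_t* rhs, size_t m) {
//   for (size_t i = 0; i < m; ++i) {
//     lhs[i] = std::max(lhs[i], rhs[i]);  // register-wise max == sketch union
//   }
// }
//
// int main() {
//   const size_t m = 8;  // 2^b registers; real code uses 1 << bitmap_sz_bits
//   std::vector<int8_t> device0 = {1, 4, 0, 2, 5, 0, 1, 3};
//   std::vector<int8_t> device1 = {2, 1, 3, 2, 0, 6, 0, 3};
//   unify(device0.data(), device1.data(), m);
//   for (auto r : device0) {
//     std::cout << int(r) << ' ';  // 2 4 3 2 5 6 1 3
//   }
//   std::cout << '\n';
// }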
ColumnsForDevice BaselineJoinHashTable::fetchColumnsForDevice(
    const std::vector<Fragmenter_Namespace::FragmentInfo>& fragments,
    const int device_id,
    DeviceAllocator* dev_buff_owner) {
  const auto effective_memory_level =
      get_effective_memory_level(memory_level_, needs_dict_translation_);

  std::vector<JoinColumn> join_columns;
  std::vector<std::shared_ptr<Chunk_NS::Chunk>> chunks_owner;
  std::vector<JoinColumnTypeInfo> join_column_types;
  std::vector<JoinBucketInfo> join_bucket_info;
  std::vector<std::shared_ptr<void>> malloc_owner;
  for (const auto& inner_outer_pair : inner_outer_pairs_) {
    const auto inner_col = inner_outer_pair.first;
    const auto inner_cd = get_column_descriptor_maybe(inner_col->getColumnKey());
    if (inner_cd && inner_cd->isVirtualCol) {
      throw FailedToJoinOnVirtualColumn();
    }
    join_columns.emplace_back(fetchJoinColumn(inner_col,
                                              fragments,
                                              effective_memory_level,
                                              device_id,
                                              chunks_owner,
                                              dev_buff_owner,
                                              malloc_owner,
                                              executor_,
                                              &column_cache_));
    const auto& ti = inner_col->get_type_info();
    join_column_types.emplace_back(JoinColumnTypeInfo{static_cast<size_t>(ti.get_size()),
                                                      0,
                                                      0,
                                                      inline_fixed_encoding_null_val(ti),
                                                      isBitwiseEq(),
                                                      0,
                                                      get_join_column_type_kind(ti)});
  }
  return {join_columns, join_column_types, chunks_owner, join_bucket_info, malloc_owner};
}
void BaselineJoinHashTable::reifyForDevice(
    const ColumnsForDevice& columns_for_device,
    const HashType layout,
    const int device_id,
    const size_t entry_count,
    const size_t emitted_keys_count,
    const logger::ThreadLocalIds parent_thread_local_ids) {
  logger::LocalIdsScopeGuard lisg = parent_thread_local_ids.setNewThreadId();
  DEBUG_TIMER_NEW_THREAD(parent_thread_local_ids.thread_id_);
  const auto effective_memory_level =
      get_effective_memory_level(memory_level_, needs_dict_translation_);
  const auto err = initHashTableForDevice(columns_for_device.join_columns,
                                          columns_for_device.join_column_types,
                                          columns_for_device.join_buckets,
                                          layout,
                                          effective_memory_level,
                                          entry_count,
                                          emitted_keys_count,
                                          device_id);
  if (err) {
    throw HashJoinFail(
        std::string("Unrecognized error when initializing baseline hash table (") +
        std::to_string(err) + std::string(")"));
  }
}

size_t BaselineJoinHashTable::getKeyComponentWidth() const {
  for (const auto& inner_outer_pair : inner_outer_pairs_) {
    const auto inner_col = inner_outer_pair.first;
    const auto& inner_col_ti = inner_col->get_type_info();
    if (inner_col_ti.get_logical_size() > 4) {
      CHECK_EQ(8, inner_col_ti.get_logical_size());
      return 8;
    }
  }
  return 4;
}

size_t BaselineJoinHashTable::getKeyComponentCount() const {
  return inner_outer_pairs_.size();
}

Data_Namespace::MemoryLevel BaselineJoinHashTable::getEffectiveMemoryLevel(
    const std::vector<InnerOuter>& inner_outer_pairs) const {
  if (needs_dictionary_translation(
          inner_outer_pairs, inner_outer_string_op_infos_pairs_, executor_)) {
    needs_dict_translation_ = true;
    return Data_Namespace::CPU_LEVEL;
  }
  return memory_level_;
}
void BaselineJoinHashTable::copyCpuHashTableToGpu(
    std::shared_ptr<BaselineHashTable>& cpu_hash_table,
    const int device_id,
    Data_Namespace::DataMgr* data_mgr) {
  CHECK_EQ(Data_Namespace::MemoryLevel::GPU_LEVEL, memory_level_);
  BaselineJoinHashTableBuilder builder;
  builder.allocateDeviceMemory(cpu_hash_table->getLayout(),
                               getKeyComponentWidth(),
                               getKeyComponentCount(),
                               cpu_hash_table->getEntryCount(),
                               cpu_hash_table->getEmittedKeysCount(),
                               device_id,
                               executor_,
                               query_hints_);
  auto gpu_target_hash_table = builder.getHashTable();
  CHECK(gpu_target_hash_table);
  const auto gpu_buff = gpu_target_hash_table->getGpuBuffer();
  CHECK(gpu_buff);
  auto allocator = std::make_unique<CudaAllocator>(
      data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
  allocator->copyToDevice(
      gpu_buff,
      cpu_hash_table->getCpuBuffer(),
      cpu_hash_table->getHashTableBufferSize(ExecutorDeviceType::CPU));
  hash_tables_for_device_[device_id] = std::move(gpu_target_hash_table);
}
StrProxyTranslationMapsPtrsAndOffsets decomposeStrDictTranslationMaps(
    const std::vector<const StringDictionaryProxy::IdMap*>& str_proxy_translation_maps) {
  StrProxyTranslationMapsPtrsAndOffsets translation_map_ptrs_and_offsets;
  const size_t num_translation_maps = str_proxy_translation_maps.size();
  translation_map_ptrs_and_offsets.first.reserve(num_translation_maps);
  translation_map_ptrs_and_offsets.second.reserve(num_translation_maps);
  for (const auto& str_proxy_translation_map : str_proxy_translation_maps) {
    if (str_proxy_translation_map) {
      translation_map_ptrs_and_offsets.first.emplace_back(
          str_proxy_translation_map->data());
      translation_map_ptrs_and_offsets.second.emplace_back(
          str_proxy_translation_map->domainStart());
    } else {
      // Dummy values to keep the per-key positions aligned.
      translation_map_ptrs_and_offsets.first.emplace_back(nullptr);
      translation_map_ptrs_and_offsets.second.emplace_back(0);
    }
  }
  return translation_map_ptrs_and_offsets;
}
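// The function above flattens a vector of nullable map objects into a pair of
// parallel arrays (data pointers + domain offsets), the structure-of-arrays
// form the key handlers consume. A standalone sketch of the same decomposition
// with a simplified, hypothetical IdMap:
//
// #include <cstdint>
// #include <iostream>
// #include <utility>
// #include <vector>
//
// struct IdMap {
//   std::vector<int32_t> ids;
//   int32_t domain_start = 0;
//   const int32_t* data() const { return ids.data(); }
//   int32_t domainStart() const { return domain_start; }
// };
//
// using PtrsAndOffsets = std::pair<std::vector<const int32_t*>, std::vector<int32_t>>;
//
// PtrsAndOffsets decompose(const std::vector<const IdMap*>& maps) {
//   PtrsAndOffsets result;
//   result.first.reserve(maps.size());
//   result.second.reserve(maps.size());
//   for (const auto* map : maps) {
//     result.first.emplace_back(map ? map->data() : nullptr);  // null -> placeholder
//     result.second.emplace_back(map ? map->domainStart() : 0);
//   }
//   return result;
// }
//
// int main() {
//   IdMap m{{5, 6, 7}, -2};
//   std::vector<const IdMap*> maps = {&m, nullptr};
//   const auto decomposed = decompose(maps);
//   std::cout << decomposed.first.size() << " ptrs, first offset "
//             << decomposed.second.front() << '\n';
// }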
int BaselineJoinHashTable::initHashTableForDevice(
    const std::vector<JoinColumn>& join_columns,
    const std::vector<JoinColumnTypeInfo>& join_column_types,
    const std::vector<JoinBucketInfo>& join_bucket_info,
    const HashType layout,
    const Data_Namespace::MemoryLevel effective_memory_level,
    const size_t entry_count,
    const size_t emitted_keys_count,
    const int device_id) {
  auto timer = DEBUG_TIMER(__func__);
  const auto key_component_count = getKeyComponentCount();
  int err = 0;
  decltype(std::chrono::steady_clock::now()) ts1, ts2;
  ts1 = std::chrono::steady_clock::now();
  auto allow_hashtable_recycling =
      HashtableRecycler::isSafeToCacheHashtable(table_id_to_node_map_,
                                                needs_dict_translation_,
                                                inner_outer_string_op_infos_pairs_,
                                                getInnerTableId());
  if (effective_memory_level == Data_Namespace::CPU_LEVEL) {
    std::lock_guard<std::mutex> cpu_hash_table_buff_lock(cpu_hash_table_buff_mutex_);
    CHECK(!join_columns.empty());
    // ...
    std::shared_ptr<HashTable> hash_table{nullptr};
    const auto composite_key_info = HashJoin::getCompositeKeyInfo(
        inner_outer_pairs_, executor_, inner_outer_string_op_infos_pairs_);
    const auto str_proxy_translation_map_ptrs_and_offsets =
        decomposeStrDictTranslationMaps(str_proxy_translation_maps_);
    BaselineJoinHashTableBuilder builder;
    const auto key_handler =
        GenericKeyHandler(key_component_count,
                          true,
                          &join_columns[0],
                          &join_column_types[0],
                          &str_proxy_translation_map_ptrs_and_offsets.first[0],
                          &str_proxy_translation_map_ptrs_and_offsets.second[0]);
    err = builder.initHashTableOnCpu(&key_handler,
                                     composite_key_info,
                                     join_columns,
                                     join_column_types,
                                     join_bucket_info,
                                     str_proxy_translation_map_ptrs_and_offsets,
                                     entry_count,
                                     join_columns.front().num_elems,
                                     layout,
                                     join_type_,
                                     getKeyComponentWidth(),
                                     key_component_count,
                                     query_hints_);
    hash_tables_for_device_[device_id] = builder.getHashTable();
    ts2 = std::chrono::steady_clock::now();
    auto hashtable_build_time =
        std::chrono::duration_cast<std::chrono::milliseconds>(ts2 - ts1).count();
    if (allow_hashtable_recycling && hash_tables_for_device_[device_id]) {
      putHashTableOnCpuToCache(hashtable_cache_key_[device_id],
                               CacheItemType::BASELINE_HT,
                               hash_tables_for_device_[device_id],
                               DataRecyclerUtil::CPU_DEVICE_IDENTIFIER,
                               hashtable_build_time);
    }
    // ...
  } else {
    BaselineJoinHashTableBuilder builder;
    auto data_mgr = executor_->getDataMgr();
    CudaAllocator allocator(
        data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
    auto join_column_types_gpu =
        transfer_vector_of_flat_objects_to_gpu(join_column_types, allocator);
    auto join_columns_gpu =
        transfer_vector_of_flat_objects_to_gpu(join_columns, allocator);
    const auto key_handler = GenericKeyHandler(key_component_count,
                                               true,
                                               join_columns_gpu,
                                               join_column_types_gpu,
                                               nullptr,
                                               nullptr);
    err = builder.initHashTableOnGpu(&key_handler,
                                     join_columns,
                                     layout,
                                     join_type_,
                                     getKeyComponentWidth(),
                                     key_component_count,
                                     entry_count,
                                     emitted_keys_count,
                                     device_id,
                                     executor_,
                                     query_hints_);
    hash_tables_for_device_[device_id] = builder.getHashTable();
  }
  if (!err && allow_hashtable_recycling && hash_tables_for_device_[device_id]) {
    // Record the layout that was actually built so later queries can start
    // from the right scheme.
    hash_table_layout_cache_->putItemToCache(
        hashtable_cache_key_[device_id],
        hash_tables_for_device_[device_id]->getLayout(),
        CacheItemType::HT_HASHING_SCHEME,
        DataRecyclerUtil::CPU_DEVICE_IDENTIFIER,
        0,
        0,
        {});
  }
  return err;
}
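// The recycler logic around the build follows a get-or-build-and-publish
// shape: hash the inputs into a cache key, return a shared cached table on a
// hit, otherwise build, time the build, and publish the result. A standalone
// sketch of that pattern (all names hypothetical, single-threaded for
// brevity; the real cache is mutex-guarded):
//
// #include <chrono>
// #include <iostream>
// #include <memory>
// #include <unordered_map>
//
// struct HashTable { int entries = 0; };
// using QueryPlanHash = size_t;
//
// std::unordered_map<QueryPlanHash, std::shared_ptr<HashTable>> g_cache;
//
// std::shared_ptr<HashTable> get_or_build(QueryPlanHash key) {
//   if (auto it = g_cache.find(key); it != g_cache.end()) {
//     std::cout << "cache hit\n";
//     return it->second;
//   }
//   const auto ts1 = std::chrono::steady_clock::now();
//   auto table = std::make_shared<HashTable>(HashTable{42});  // expensive build
//   const auto ts2 = std::chrono::steady_clock::now();
//   std::cout << "built in "
//             << std::chrono::duration_cast<std::chrono::milliseconds>(ts2 - ts1)
//                    .count()
//             << " ms\n";
//   g_cache.emplace(key, table);
//   return table;
// }
//
// int main() {
//   get_or_build(1234);  // miss: builds and caches
//   get_or_build(1234);  // hit: returns the cached table
// }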
#define LL_CONTEXT executor_->cgen_state_->context_
#define LL_BUILDER executor_->cgen_state_->ir_builder_
#define LL_INT(v) executor_->cgen_state_->llInt(v)
#define LL_FP(v) executor_->cgen_state_->llFp(v)
#define ROW_FUNC executor_->cgen_state_->row_func_
llvm::Value* BaselineJoinHashTable::codegenSlot(const CompilationOptions& co,
                                                const size_t index) {
  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
  CHECK(getHashType() == HashType::OneToOne);
  const auto hash_table = getHashTableForDevice(size_t(0));
  const auto key_component_width = getKeyComponentWidth();
  CHECK(key_component_width == 4 || key_component_width == 8);
  auto key_buff_lv = codegenKey(co);
  const auto hash_ptr = hashPtr(index);
  const auto key_ptr_lv =
      LL_BUILDER.CreatePointerCast(key_buff_lv, llvm::Type::getInt8PtrTy(LL_CONTEXT));
  const auto key_size_lv = LL_INT(getKeyComponentCount() * key_component_width);
  return executor_->cgen_state_->emitExternalCall(
      "baseline_hash_join_idx_" + std::to_string(key_component_width * 8),
      get_int_type(64, LL_CONTEXT),
      {hash_ptr, key_ptr_lv, key_size_lv, LL_INT(hash_table->getEntryCount())});
}
HashJoinMatchingSet BaselineJoinHashTable::codegenMatchingSet(
    const CompilationOptions& co,
    const size_t index) {
  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
  const auto hash_table = getHashTableForDevice(size_t(0));
  CHECK(hash_table);
  const auto key_component_width = getKeyComponentWidth();
  CHECK(key_component_width == 4 || key_component_width == 8);
  auto key_buff_lv = codegenKey(co);
  CHECK(getHashType() == HashType::OneToMany);
  auto hash_ptr = HashJoin::codegenHashTableLoad(index, executor_);
  const auto composite_dict_ptr_type =
      llvm::Type::getIntNPtrTy(LL_CONTEXT, key_component_width * 8);
  const auto composite_key_dict =
      hash_ptr->getType()->isPointerTy()
          ? LL_BUILDER.CreatePointerCast(hash_ptr, composite_dict_ptr_type)
          : LL_BUILDER.CreateIntToPtr(hash_ptr, composite_dict_ptr_type);
  const auto key_component_count = getKeyComponentCount();
  const auto key = executor_->cgen_state_->emitExternalCall(
      "get_composite_key_index_" + std::to_string(key_component_width * 8),
      get_int_type(64, LL_CONTEXT),
      {key_buff_lv,
       LL_INT(key_component_count),
       composite_key_dict,
       LL_INT(hash_table->getEntryCount())});
  auto one_to_many_ptr = hash_ptr;
  if (one_to_many_ptr->getType()->isPointerTy()) {
    one_to_many_ptr =
        LL_BUILDER.CreatePtrToInt(one_to_many_ptr, llvm::Type::getInt64Ty(LL_CONTEXT));
  } else {
    CHECK(one_to_many_ptr->getType()->isIntegerTy(64));
  }
  const auto composite_key_dict_size = offsetBufferOff();
  one_to_many_ptr =
      LL_BUILDER.CreateAdd(one_to_many_ptr, LL_INT(composite_key_dict_size));
  return HashJoin::codegenMatchingSet({one_to_many_ptr,
                                       key,
                                       LL_INT(int64_t(0)),
                                       LL_INT(hash_table->getEntryCount() - 1)},
                                      false,
                                      false,
                                      isBitwiseEq(),
                                      getComponentBufferSize(),
                                      executor_);
}
size_t BaselineJoinHashTable::offsetBufferOff() const noexcept {
  return getKeyBufferSize();
}

size_t BaselineJoinHashTable::countBufferOff() const noexcept {
  return offsetBufferOff() + getComponentBufferSize();
}

size_t BaselineJoinHashTable::payloadBufferOff() const noexcept {
  return countBufferOff() + getComponentBufferSize();
}

size_t BaselineJoinHashTable::getKeyBufferSize() const noexcept {
  const auto key_component_width = getKeyComponentWidth();
  CHECK(key_component_width == 4 || key_component_width == 8);
  const auto key_component_count = getKeyComponentCount();
  const auto hash_table = getHashTableForDevice(size_t(0));
  CHECK(hash_table);
  if (layoutRequiresAdditionalBuffers(hash_table->getLayout())) {
    return hash_table->getEntryCount() * key_component_count * key_component_width;
  } else {
    return hash_table->getEntryCount() * (key_component_count + 1) * key_component_width;
  }
}

size_t BaselineJoinHashTable::getComponentBufferSize() const noexcept {
  const auto hash_table = getHashTableForDevice(size_t(0));
  CHECK(hash_table);
  return hash_table->getEntryCount() * sizeof(int32_t);
}
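// The offsets above describe the one-to-many buffer layout: after the key
// section come three parallel int32 arrays, one slot per entry, holding
// payload offsets, match counts, and the payload (row ids) itself. A
// standalone sketch of probing that CSR-style layout (toy data, not the real
// buffer format):
//
// #include <cstdint>
// #include <iostream>
// #include <vector>
//
// int main() {
//   const size_t entry_count = 3;
//   // offsets[i] / counts[i] describe entry i's slice of the payload array.
//   const std::vector<int32_t> offsets = {0, 2, 3};
//   const std::vector<int32_t> counts = {2, 1, 3};
//   const std::vector<int32_t> payload = {10, 11, 20, 30, 31, 32};
//   for (size_t entry = 0; entry < entry_count; ++entry) {
//     std::cout << "entry " << entry << " rows:";
//     for (int32_t i = 0; i < counts[entry]; ++i) {
//       std::cout << ' ' << payload[offsets[entry] + i];
//     }
//     std::cout << '\n';
//   }
// }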
llvm::Value* BaselineJoinHashTable::codegenKey(const CompilationOptions& co) {
  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
  const auto key_component_width = getKeyComponentWidth();
  CHECK(key_component_width == 4 || key_component_width == 8);
  llvm::Value* key_buff_lv{nullptr};
  switch (key_component_width) {
    case 4:
      key_buff_lv = LL_BUILDER.CreateAlloca(get_int_type(32, LL_CONTEXT),
                                            LL_INT(getKeyComponentCount()));
      break;
    case 8:
      key_buff_lv = LL_BUILDER.CreateAlloca(get_int_type(64, LL_CONTEXT),
                                            LL_INT(getKeyComponentCount()));
      break;
    default:
      CHECK(false);
  }

  CodeGenerator code_generator(executor_);
  for (size_t i = 0; i < getKeyComponentCount(); ++i) {
    const auto key_comp_dest_lv = LL_BUILDER.CreateGEP(
        key_buff_lv->getType()->getScalarType()->getPointerElementType(),
        key_buff_lv,
        LL_INT(i));
    const auto& inner_outer_pair = inner_outer_pairs_[i];
    const auto outer_col = inner_outer_pair.second;
    const auto key_col_var =
        dynamic_cast<const Analyzer::ColumnVar*>(inner_outer_pair.first);
    const auto val_col_var = dynamic_cast<const Analyzer::ColumnVar*>(outer_col);
    if (key_col_var && val_col_var &&
        self_join_not_covered_by_left_deep_tree(
            key_col_var,
            val_col_var,
            get_max_rte_scan_table(executor_->cgen_state_->scan_idx_to_hash_pos_))) {
      throw std::runtime_error(
          "Query execution fails because the query contains not supported self-join "
          "pattern. We suspect the query requires multiple left-deep join tree due to "
          "the join condition of the self-join and is not supported for now. Please "
          "consider rewriting table order in FROM clause.");
    }
    auto key_lv = HashJoin::codegenColOrStringOper(
        outer_col, inner_outer_string_op_infos_pairs_[i].second, code_generator, co);
    const auto key_lv_ext =
        LL_BUILDER.CreateSExt(key_lv, get_int_type(key_component_width * 8, LL_CONTEXT));
    LL_BUILDER.CreateStore(key_lv_ext, key_comp_dest_lv);
  }
  return key_buff_lv;
}
llvm::Value* BaselineJoinHashTable::hashPtr(const size_t index) {
  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
  auto hash_ptr = HashJoin::codegenHashTableLoad(index, executor_);
  const auto pi8_type = llvm::Type::getInt8PtrTy(LL_CONTEXT);
  return hash_ptr->getType()->isPointerTy()
             ? LL_BUILDER.CreatePointerCast(hash_ptr, pi8_type)
             : LL_BUILDER.CreateIntToPtr(hash_ptr, pi8_type);
}

int BaselineJoinHashTable::getInnerTableRteIdx() const noexcept {
  CHECK(!inner_outer_pairs_.empty());
  const auto first_inner_col = inner_outer_pairs_.front().first;
  return first_inner_col->get_rte_idx();
}

HashType BaselineJoinHashTable::getHashType() const noexcept {
  auto hash_table = getHashTableForDevice(size_t(0));
  CHECK(hash_table);
  if (layout_override_) {
    return *layout_override_;
  }
  return hash_table->getLayout();
}
shared::TableKey BaselineJoinHashTable::getInnerTableId(
    const std::vector<InnerOuter>& inner_outer_pairs) {
  CHECK(!inner_outer_pairs.empty());
  const auto first_inner_col = inner_outer_pairs.front().first;
  return first_inner_col->getTableKey();
}
std::shared_ptr<HashTable> BaselineJoinHashTable::initHashTableOnCpuFromCache(
    QueryPlanHash key,
    CacheItemType item_type,
    DeviceIdentifier device_identifier) {
  CHECK(hash_table_cache_);
  auto timer = DEBUG_TIMER(__func__);
  VLOG(1) << "Checking CPU hash table cache.";
  return hash_table_cache_->getItemFromCache(key, item_type, device_identifier);
}

void BaselineJoinHashTable::putHashTableOnCpuToCache(
    QueryPlanHash key,
    CacheItemType item_type,
    std::shared_ptr<HashTable> hashtable_ptr,
    DeviceIdentifier device_identifier,
    size_t hashtable_building_time) {
  CHECK(hash_table_cache_);
  CHECK(hashtable_ptr && !hashtable_ptr->getGpuBuffer());
  hash_table_cache_->putItemToCache(
      key,
      hashtable_ptr,
      item_type,
      device_identifier,
      hashtable_ptr->getHashTableBufferSize(ExecutorDeviceType::CPU),
      hashtable_building_time);
}
ChunkKey BaselineJoinHashTable::genChunkKey(
    const std::vector<Fragmenter_Namespace::FragmentInfo>& fragments) const {
  std::vector<int> fragment_ids;
  std::for_each(
      fragments.cbegin(), fragments.cend(), [&fragment_ids](const auto& fragment) {
        fragment_ids.push_back(fragment.fragmentId);
      });
  return fragment_ids;
}
bool needs_dict_translation_
size_t offsetBufferOff() const noexcept override
std::set< DecodedJoinHashBufferEntry > toSet(const ExecutorDeviceType device_type, const int device_id) const override
std::vector< int > ChunkKey
void putHashTableOnCpuToCache(QueryPlanHash key, CacheItemType item_type, std::shared_ptr< HashTable > hashtable_ptr, DeviceIdentifier device_identifier, size_t hashtable_building_time)
virtual HashJoinMatchingSet codegenMatchingSet(const CompilationOptions &, const size_t)=0
std::mutex str_proxy_translation_mutex_
std::string toString(const ExecutorDeviceType device_type, const int device_id=0, bool raw=false) const override
static llvm::Value * codegenHashTableLoad(const size_t table_idx, Executor *executor)
bool self_join_not_covered_by_left_deep_tree(const Analyzer::ColumnVar *key_side, const Analyzer::ColumnVar *val_side, const int max_rte_covered)
std::vector< QueryPlanHash > hashtable_cache_key_
static bool isInvalidHashTableCacheKey(const std::vector< QueryPlanHash > &cache_keys)
Data_Namespace::MemoryLevel getEffectiveMemoryLevel(const std::vector< InnerOuter > &inner_outer_pairs) const
static bool canAccessHashTable(bool allow_hash_table_recycling, bool invalid_cache_key, JoinType join_type)
T * transfer_flat_object_to_gpu(const T &object, DeviceAllocator &allocator)
static void checkHashJoinReplicationConstraint(const shared::TableKey &table_key, const size_t shard_count, const Executor *executor)
HashJoinMatchingSet codegenMatchingSet(const CompilationOptions &, const size_t) override
void hll_unify(T1 *lhs, T2 *rhs, const size_t m)
JoinColumn fetchJoinColumn(const Analyzer::ColumnVar *hash_col, const std::vector< Fragmenter_Namespace::FragmentInfo > &fragment_info, const Data_Namespace::MemoryLevel effective_memory_level, const int device_id, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, DeviceAllocator *dev_buff_owner, std::vector< std::shared_ptr< void >> &malloc_owner, Executor *executor, ColumnCacheMap *column_cache)
std::vector< std::shared_ptr< HashTable > > hash_tables_for_device_
Data_Namespace::MemoryLevel get_effective_memory_level(const Data_Namespace::MemoryLevel memory_level, const bool needs_dict_translation)
const InputTableInfo & get_inner_query_info(const shared::TableKey &inner_table_key, const std::vector< InputTableInfo > &query_infos)
int initHashTableOnGpu(KEY_HANDLER *key_handler, const std::vector< JoinColumn > &join_columns, const HashType layout, const JoinType join_type, const size_t key_component_width, const size_t key_component_count, const size_t keyspace_entry_count, const size_t emitted_keys_count, const int device_id, const Executor *executor, const RegisteredQueryHint &query_hint)
HashTableBuildDagMap hashtable_build_dag_map_
size_t getKeyBufferSize() const noexcept
#define DEBUG_TIMER_NEW_THREAD(parent_thread_id)
int initHashTableOnCpu(KEY_HANDLER *key_handler, const CompositeKeyInfo &composite_key_info, const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_bucket_info, const StrProxyTranslationMapsPtrsAndOffsets &str_proxy_translation_maps_ptrs_and_offsets, const size_t keyspace_entry_count, const size_t keys_for_all_rows, const HashType layout, const JoinType join_type, const size_t key_component_width, const size_t key_component_count, const RegisteredQueryHint &query_hint)
const TableIdToNodeMap table_id_to_node_map_
size_t getComponentBufferSize() const noexcept override
RegisteredQueryHint query_hints_
bool needs_dictionary_translation(const std::vector< InnerOuter > &inner_outer_pairs, const std::vector< InnerOuterStringOpInfos > &inner_outer_string_op_infos_pairs, const Executor *executor)
static llvm::Value * codegenColOrStringOper(const Analyzer::Expr *col_or_string_oper, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos, CodeGenerator &code_generator, const CompilationOptions &co)
void freeHashBufferMemory()
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
size_t hll_size(const T *M, const size_t bitmap_sz_bits)
const int get_max_rte_scan_table(std::unordered_map< int, llvm::Value * > &scan_idx_to_hash_pos)
int getInnerTableRteIdx() const noexcept override
std::unordered_set< size_t > table_keys_
const JoinType join_type_
virtual ColumnsForDevice fetchColumnsForDevice(const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments, const int device_id, DeviceAllocator *dev_buff_owner)
const std::vector< InputTableInfo > & query_infos_
virtual llvm::Value * codegenKey(const CompilationOptions &)
std::shared_ptr< HashTable > initHashTableOnCpuFromCache(QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier)
size_t payloadBufferOff() const noexcept override
std::vector< InnerOuter > inner_outer_pairs_
const std::vector< JoinColumnTypeInfo > join_column_types
void reify(const HashType preferred_layout)
void approximate_distinct_tuples(uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
future< Result > async(Fn &&fn, Args &&...args)
std::unordered_map< size_t, HashTableBuildDag > HashTableBuildDagMap
HashType getHashType() const noexcept override
static QueryPlanHash getAlternativeCacheKey(AlternativeCacheKeyForBaselineHashJoin &info)
std::vector< InnerOuterStringOpInfos > inner_outer_string_op_infos_pairs_
const ColumnDescriptor * get_column_descriptor_maybe(const shared::ColumnKey &column_key)
static std::unique_ptr< HashtableRecycler > hash_table_cache_
ColumnCacheMap & column_cache_
std::vector< Fragmenter_Namespace::FragmentInfo > only_shards_for_device(const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments, const int device_id, const int device_count)
size_t shardCount() const
int8_t * getJoinHashBuffer(const ExecutorDeviceType device_type, const int device_id) const
DEVICE auto accumulate(ARGS &&...args)
BaselineJoinHashTable(const std::shared_ptr< Analyzer::BinOper > condition, const JoinType join_type, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, ColumnCacheMap &column_cache, Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs, const std::vector< InnerOuterStringOpInfos > &col_pairs_string_op_infos, const int device_count, const RegisteredQueryHint &query_hints, const HashTableBuildDagMap &hashtable_build_dag_map, const TableIdToNodeMap &table_id_to_node_map)
virtual std::pair< size_t, size_t > approximateTupleCount(const std::vector< ColumnsForDevice > &) const
static std::vector< const StringDictionaryProxy::IdMap * > translateCompositeStrDictProxies(const CompositeKeyInfo &composite_key_info, const std::vector< InnerOuterStringOpInfos > &string_op_infos_for_keys, const Executor *executor)
HashtableCacheMetaInfo hashtable_cache_meta_info_
static std::unordered_set< size_t > getAlternativeTableKeys(const std::vector< ChunkKey > &chunk_keys, const shared::TableKey &inner_table_key)
void allocateDeviceMemory(const HashType layout, const size_t key_component_width, const size_t key_component_count, const size_t keyspace_entry_count, const size_t emitted_keys_count, const int device_id, const Executor *executor, const RegisteredQueryHint &query_hint)
virtual void reifyWithLayout(const HashType layout)
HashTable * getHashTableForDevice(const size_t device_id) const
std::unordered_map< shared::TableKey, const RelAlgNode * > TableIdToNodeMap
std::pair< std::vector< const int32_t * >, std::vector< int32_t >> StrProxyTranslationMapsPtrsAndOffsets
virtual void reifyForDevice(const ColumnsForDevice &columns_for_device, const HashType layout, const int device_id, const size_t entry_count, const size_t emitted_keys_count, const logger::ThreadLocalIds parent_thread_local_ids)
std::unique_ptr< BaselineHashTable > getHashTable()
static std::string getHashTypeString(HashType ht) noexcept
std::optional< HashType > layout_override_
static std::string toString(const std::string &type, const std::string &layout_type, size_t key_component_count, size_t key_component_width, size_t entry_count, const int8_t *ptr1, const int8_t *ptr2, const int8_t *ptr3, const int8_t *ptr4, size_t buffer_size, bool raw=false)
Decode hash table into a human-readable string.
LocalIdsScopeGuard setNewThreadId() const
size_t get_entries_per_device(const size_t total_entries, const size_t shard_count, const size_t device_count, const Data_Namespace::MemoryLevel memory_level)
static std::shared_ptr< BaselineJoinHashTable > getInstance(const std::shared_ptr< Analyzer::BinOper > condition, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hints, const TableIdToNodeMap &table_id_to_node_map)
Make hash table from an in-flight SQL query's parse tree etc.
std::unordered_map< shared::TableKey, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
CUstream getQueryEngineCudaStreamForDevice(int device_num)
const Data_Namespace::MemoryLevel memory_level_
llvm::Value * hashPtr(const size_t index)
virtual int initHashTableForDevice(const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_buckets, const HashType layout, const Data_Namespace::MemoryLevel effective_memory_level, const size_t entry_count, const size_t emitted_keys_count, const int device_id)
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
size_t getNumTuplesUpperBound() const
ColumnType get_join_column_type_kind(const SQLTypeInfo &ti)
bool g_enable_watchdog{false}
llvm::Value * codegenSlot(const CompilationOptions &, const size_t) override
bool isBitwiseEq() const override
#define DEBUG_TIMER(name)
static bool isSafeToCacheHashtable(const TableIdToNodeMap &table_id_to_node_map, bool need_dict_translation, const std::vector< InnerOuterStringOpInfos > &inner_outer_string_op_info_pairs, const shared::TableKey &table_key)
static std::pair< std::vector< InnerOuter >, std::vector< InnerOuterStringOpInfos > > normalizeColumnPairs(const Analyzer::BinOper *condition, const TemporaryTables *temporary_tables)
virtual size_t getKeyComponentCount() const
int64_t inline_fixed_encoding_null_val(const SQL_TYPE_INFO &ti)
void copyCpuHashTableToGpu(std::shared_ptr< BaselineHashTable > &cpu_hash_table, const int device_id, Data_Namespace::DataMgr *data_mgr)
ChunkKey genChunkKey(const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments) const
std::vector< const StringDictionaryProxy::IdMap * > str_proxy_translation_maps_
virtual size_t getKeyComponentWidth() const
std::mutex cpu_hash_table_buff_mutex_
const int device_count_
static DecodedJoinHashBufferSet toSet(size_t key_component_count, size_t key_component_width, size_t entry_count, const int8_t *ptr1, const int8_t *ptr2, const int8_t *ptr3, const int8_t *ptr4, size_t buffer_size)
Decode hash table into a std::set for easy inspection and validation.
T * transfer_vector_of_flat_objects_to_gpu(const std::vector< T > &vec, DeviceAllocator &allocator)
Allocate GPU memory using GpuBuffers via DataMgr.
static size_t getShardCountForCondition(const Analyzer::BinOper *condition, const Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs)
std::vector< JoinBucketInfo > join_buckets
static std::unique_ptr< HashingSchemeRecycler > hash_table_layout_cache_
static constexpr DeviceIdentifier CPU_DEVICE_IDENTIFIER
size_t get_shard_count(const Analyzer::BinOper *join_condition, const Executor *executor)
StrProxyTranslationMapsPtrsAndOffsets decomposeStrDictTranslationMaps(const std::vector< const StringDictionaryProxy::IdMap * > &str_proxy_translation_maps)
static HashtableAccessPathInfo getHashtableAccessPathInfo(const std::vector< InnerOuter > &inner_outer_pairs, const std::vector< InnerOuterStringOpInfos > &inner_outer_string_op_infos_pairs, const SQLOps op_type, const JoinType join_type, const HashTableBuildDagMap &hashtable_build_dag_map, int device_count, int shard_count, const std::vector< std::vector< Fragmenter_Namespace::FragmentInfo >> &frags_for_device, Executor *executor)
shared::TableKey getInnerTableId() const noexcept override
ThreadLocalIds thread_local_ids()
const std::vector< JoinColumn > join_columns
static bool layoutRequiresAdditionalBuffers(HashType layout) noexcept
const std::shared_ptr< Analyzer::BinOper > condition_
size_t countBufferOff() const noexcept override
static CompositeKeyInfo getCompositeKeyInfo(const std::vector< InnerOuter > &inner_outer_pairs, const Executor *executor, const std::vector< InnerOuterStringOpInfos > &inner_outer_string_op_infos_pairs={})