OmniSciDB  a667adc9c8
HashJoinRuntime.cpp File Reference

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict (const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE auto fill_hash_join_buff_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const size_t cpu_thread_count)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 

Macro Definition Documentation

#define cas_cst(ptr, expected, desired)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 412 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 416 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add(address, val)   __sync_fetch_and_add(address, val)

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)

#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 415 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().
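
Together these macros give the hash-table builders one spelling for atomic operations that compiles to GCC/Clang builtins on the CPU path (the CUDA build substitutes device intrinsics). As a rough illustration of the pattern they support, the sketch below uses the same builtins to claim an empty slot or accept that another thread already stored the same key; the EMPTY_KEY constant and slot layout are illustrative assumptions, not the runtime's.

#include <cstdint>

// Illustrative only: EMPTY_KEY and the single-int64_t slot are assumptions,
// not the runtime's actual constants or layout.
constexpr int64_t EMPTY_KEY = -1;

// Claim a slot for `key`, or accept it if another thread already wrote the
// same key. cas_cst(ptr, expected, desired) expands to the
// __atomic_compare_exchange_n call below; load_cst(ptr) to __atomic_load_n.
bool claim_or_match(int64_t* slot, int64_t key) {
  int64_t expected = EMPTY_KEY;
  if (__atomic_compare_exchange_n(
          slot, &expected, key, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
    return true;  // this thread wrote the key
  }
  return __atomic_load_n(slot, __ATOMIC_SEQ_CST) == key;  // someone else wrote the same key
}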

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 1964 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

1969  {
1970  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
1971  CHECK(!join_column_per_key.empty());
1972 
1973  std::vector<std::future<void>> approx_distinct_threads;
1974  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
1975  approx_distinct_threads.push_back(std::async(
1976  std::launch::async,
1977  [&join_column_per_key,
1978  &type_info_per_key,
1979  b,
1980  hll_buffer_all_cpus,
1981  padded_size_bytes,
1982  thread_idx,
1983  thread_count] {
1984  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
1985 
1986  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
1987  false,
1988  &join_column_per_key[0],
1989  &type_info_per_key[0],
1990  nullptr,
1991  nullptr);
1992  approximate_distinct_tuples_impl(hll_buffer,
1993  nullptr,
1994  b,
1995  join_column_per_key[0].num_elems,
1996  &key_handler,
1997  thread_idx,
1998  thread_count);
1999  }));
2000  }
2001  for (auto& child : approx_distinct_threads) {
2002  child.get();
2003  }
2004 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:197


template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1135 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), and approximate_distinct_tuples_overlaps().

1145  {
1146 #ifdef __CUDACC__
1147  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1148  int32_t step = blockDim.x * gridDim.x;
1149 #else
1150  int32_t start = cpu_thread_idx;
1151  int32_t step = cpu_thread_count;
1152 #endif
1153 
1154  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1155  const int64_t entry_idx,
1156  const int64_t* key_scratch_buff,
1157  const size_t key_component_count) {
1158  if (row_count_buffer) {
1159  row_count_buffer[entry_idx] += 1;
1160  }
1161 
1162  const uint64_t hash =
1163  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1164  const uint32_t index = hash >> (64 - b);
1165  const auto rank = get_rank(hash << b, 64 - b);
1166 #ifdef __CUDACC__
1167  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1168 #else
1169  hll_buffer[index] = std::max(hll_buffer[index], rank);
1170 #endif
1171 
1172  return 0;
1173  };
1174 
1175  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1176 
1177  JoinColumnTuple cols(
1178  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1179  for (auto& it : cols.slice(start, step)) {
1180  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1181  }
1182 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
char * f
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

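The loop above is the HyperLogLog register update: the top b bits of the 64-bit Murmur hash select a register, and the rank of the remaining bits is kept as a running maximum (atomically on GPU, with std::max on CPU). A self-contained sketch of the same update rule, with a simplified stand-in for get_rank, looks like this:

#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified stand-in for get_rank(): 1-based position of the highest set bit
// within the top `bits` bits of x (bits + 1 if none of them is set).
inline uint8_t rank_of(uint64_t x, uint32_t bits) {
  uint8_t rank = 1;
  while (rank <= bits && !(x & (1ULL << 63))) {
    x <<= 1;
    ++rank;
  }
  return rank;
}

// One register update, mirroring the CPU branch of approximate_distinct_tuples_impl.
void hll_update(std::vector<uint8_t>& registers, uint64_t hash, uint32_t b) {
  const uint32_t index = static_cast<uint32_t>(hash >> (64 - b));  // top b bits pick the register
  const uint8_t rank = rank_of(hash << b, 64 - b);                 // rank of the remaining bits
  registers[index] = std::max(registers[index], rank);
}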

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2006 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2014  {
2015  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2016  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2017  CHECK(!join_column_per_key.empty());
2018 
2019  std::vector<std::future<void>> approx_distinct_threads;
2020  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2021  approx_distinct_threads.push_back(std::async(
2022  std::launch::async,
2023  [&join_column_per_key,
2024  &join_buckets_per_key,
2025  &row_counts,
2026  b,
2027  hll_buffer_all_cpus,
2028  padded_size_bytes,
2029  thread_idx,
2030  thread_count] {
2031  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2032 
2033  const auto key_handler = OverlapsKeyHandler(
2034  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2035  &join_column_per_key[0],
2036  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2037  approximate_distinct_tuples_impl(hll_buffer,
2038  row_counts.data(),
2039  b,
2040  join_column_per_key[0].num_elems,
2041  &key_handler,
2042  thread_idx,
2043  thread_count);
2044  }));
2045  }
2046  for (auto& child : approx_distinct_threads) {
2047  child.get();
2048  }
2049 
2050  inclusive_scan(
2051  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2052 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:197


template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1205 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), generate_TableFunctionsFactory_init::j, JoinColumnIterator::ptr(), and SUFFIX.

1214  {
1215 #ifdef __CUDACC__
1216  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1217  int32_t step = blockDim.x * gridDim.x;
1218 #else
1219  int32_t start = cpu_thread_idx;
1220  int32_t step = cpu_thread_count;
1221 #endif
1222  JoinColumnIterator it(join_column, type_info, start, step);
1223  for (; it; ++it) {
1224  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1225  double bounds[2 * N];
1226  for (size_t j = 0; j < 2 * N; j++) {
1227  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1228  }
1229 
1230  for (size_t j = 0; j < N; j++) {
1231  const auto diff = bounds[j + N] - bounds[j];
1232 #ifdef __CUDACC__
1233  if (diff > bucket_size_thresholds[j]) {
1234  atomicMin(&bucket_sizes_for_thread[j], diff);
1235  }
1236 #else
1237  if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {
1238  bucket_sizes_for_thread[j] = diff;
1239  }
1240 #endif
1241  }
1242  }
1243 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:134

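For the overlaps join the bounds column is laid out (x_min, y_min, x_max, y_max) for N = 2, the per-dimension extent is bounds[j + N] - bounds[j], and the CPU branch keeps the largest extent per dimension that still stays under the corresponding threshold. A hedged, single-threaded sketch of that selection rule for N = 2 (the Box type and helper name are illustrative):

#include <array>
#include <vector>

struct Box { double x_min, y_min, x_max, y_max; };

// Largest extent per dimension that stays below the threshold, as in the
// CPU branch of compute_bucket_sizes_impl<2>.
std::array<double, 2> bucket_sizes(const std::vector<Box>& boxes,
                                   const std::array<double, 2>& thresholds) {
  std::array<double, 2> sizes{0.0, 0.0};
  for (const auto& box : boxes) {
    const double extents[2] = {box.x_max - box.x_min, box.y_max - box.y_min};
    for (size_t j = 0; j < 2; ++j) {
      if (extents[j] < thresholds[j] && extents[j] > sizes[j]) {
        sizes[j] = extents[j];
      }
    }
  }
  return sizes;
}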

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2054 of file HashJoinRuntime.cpp.

References i.

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2058  {
2059  std::vector<std::vector<double>> bucket_sizes_for_threads;
2060  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2061  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2062  }
2063  std::vector<std::future<void>> threads;
2064  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2065  threads.push_back(std::async(std::launch::async,
2066  compute_bucket_sizes_impl<2>,
2067  bucket_sizes_for_threads[thread_idx].data(),
2068  &join_column,
2069  &type_info,
2070  bucket_size_thresholds.data(),
2071  thread_idx,
2072  thread_count));
2073  }
2074  for (auto& child : threads) {
2075  child.get();
2076  }
2077 
2078  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2079  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2080  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2081  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2082  }
2083  }
2084  }
2085 }


GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 608 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

619  {
620  auto slot_sel = [&type_info](auto count_buff, auto elem) {
621  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
622  };
623  count_matches_impl(count_buff,
624  invalid_slot_val,
625  join_column,
626  type_info
627 #ifndef __CUDACC__
628  ,
629  sd_inner_proxy,
630  sd_outer_proxy,
631  cpu_thread_idx,
632  cpu_thread_count
633 #endif
634  ,
635  slot_sel);
636 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)


template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 752 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

762  {
763 #ifdef __CUDACC__
764  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
765  int32_t step = blockDim.x * gridDim.x;
766 #else
767  int32_t start = cpu_thread_idx;
768  int32_t step = cpu_thread_count;
769 #endif
770 #ifdef __CUDACC__
771  assert(composite_key_dict);
772 #endif
773  T key_scratch_buff[g_maximum_conditions_to_coalesce];
774  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
775  auto key_buff_handler = [composite_key_dict,
776  entry_count,
777  count_buff,
778  key_size_in_bytes](const int64_t row_entry_idx,
779  const T* key_scratch_buff,
780  const size_t key_component_count) {
781  const auto matching_group =
782  SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,
783  key_component_count,
784  composite_key_dict,
785  entry_count,
786  key_size_in_bytes);
787  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
788  mapd_add(&count_buff[entry_idx], int32_t(1));
789  return 0;
790  };
791 
792  JoinColumnTuple cols(
793  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
794  for (auto& it : cols.slice(start, step)) {
795  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
796  }
797 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
char * f
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce


GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 638 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

650  {
651  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
652  return SUFFIX(get_bucketized_hash_slot)(
653  count_buff, elem, type_info.min_val, bucket_normalization);
654  };
655  count_matches_impl(count_buff,
656  invalid_slot_val,
657  join_column,
658  type_info
659 #ifndef __CUDACC__
660  ,
661  sd_inner_proxy,
662  sd_outer_proxy,
663  cpu_thread_idx,
664  cpu_thread_count
665 #endif
666  ,
667  slot_sel);
668 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)


template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 560 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by count_matches(), and count_matches_bucketized().

572  {
573 #ifdef __CUDACC__
574  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
575  int32_t step = blockDim.x * gridDim.x;
576 #else
577  int32_t start = cpu_thread_idx;
578  int32_t step = cpu_thread_count;
579 #endif
580  JoinColumnTyped col{&join_column, &type_info};
581  for (auto item : col.slice(start, step)) {
582  int64_t elem = item.element;
583  if (elem == type_info.null_val) {
584  if (type_info.uses_bw_eq) {
585  elem = type_info.translated_null_val;
586  } else {
587  continue;
588  }
589  }
590 #ifndef __CUDACC__
591  if (sd_inner_proxy &&
592  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
593  const auto outer_id = translate_str_id_to_outer_dict(
594  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
595  if (outer_id == StringDictionary::INVALID_STR_ID) {
596  continue;
597  }
598  elem = outer_id;
599  }
600  CHECK_GE(elem, type_info.min_val)
601  << "Element " << elem << " less than min val " << type_info.min_val;
602 #endif
603  auto* entry_ptr = slot_selector(count_buff, elem);
604  mapd_add(entry_ptr, int32_t(1));
605  }
606 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

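count_matches_impl above and the fill_hash_join_buff_impl / fill_row_ids_impl variants documented elsewhere on this page share the same per-element pipeline: a sentinel null either becomes translated_null_val (when bitwise-equality null matching is on) or is skipped; on the CPU path a dictionary-encoded string id is translated from the inner to the outer dictionary and skipped when no match exists; only then is the selected slot touched. A hedged, single-threaded sketch of that pipeline, with translate_id standing in for translate_str_id_to_outer_dict:

#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

// Hedged sketch; translate_id stands in for translate_str_id_to_outer_dict and
// returns std::nullopt where the runtime returns StringDictionary::INVALID_STR_ID.
void count_matches_sketch(
    int32_t* count_buff,
    const std::vector<int64_t>& column,
    int64_t null_val,
    int64_t translated_null_val,
    bool uses_bw_eq,
    int64_t min_val,
    const std::function<std::optional<int64_t>(int64_t)>& translate_id) {
  for (int64_t elem : column) {
    if (elem == null_val) {
      if (!uses_bw_eq) {
        continue;  // plain equality: nulls never match
      }
      elem = translated_null_val;  // bitwise equality: nulls share one slot
    }
    if (translate_id && (!uses_bw_eq || elem != translated_null_val)) {
      const auto outer_id = translate_id(elem);
      if (!outer_id) {
        continue;  // no matching string in the outer dictionary
      }
      elem = *outer_id;
    }
    ++count_buff[elem - min_val];  // mapd_add makes this increment atomic
  }
}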

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 670 of file HashJoinRuntime.cpp.

References CHECK_GE, get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, mapd_add, SUFFIX, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

682  {
683 #ifdef __CUDACC__
684  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
685  int32_t step = blockDim.x * gridDim.x;
686 #else
687  int32_t start = cpu_thread_idx;
688  int32_t step = cpu_thread_count;
689 #endif
690  JoinColumnTyped col{&join_column, &type_info};
691  for (auto item : col.slice(start, step)) {
692  int64_t elem = item.element;
693  if (elem == type_info.null_val) {
694  if (type_info.uses_bw_eq) {
695  elem = type_info.translated_null_val;
696  } else {
697  continue;
698  }
699  }
700 #ifndef __CUDACC__
701  if (sd_inner_proxy &&
702  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
703  const auto outer_id = translate_str_id_to_outer_dict(
704  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
705  if (outer_id == StringDictionary::INVALID_STR_ID) {
706  continue;
707  }
708  elem = outer_id;
709  }
710  CHECK_GE(elem, type_info.min_val)
711  << "Element " << elem << " less than min val " << type_info.min_val;
712 #endif
713  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
714  elem,
715  type_info.min_val,
716  shard_info.entry_count_per_shard,
717  shard_info.num_shards,
718  shard_info.device_count);
719  mapd_add(entry_ptr, int32_t(1));
720  }
721 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:210
#define SUFFIX(name)
const int device_count
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const size_t entry_count_per_shard
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:60
#define mapd_add(address, val)


template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 496 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, and omnisci.dtypes::T.

504  {
505 #ifdef __CUDACC__
506  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
507  int32_t step = blockDim.x * gridDim.x;
508 #else
509  int32_t start = cpu_thread_idx;
510  int32_t step = cpu_thread_count;
511 #endif
512 
513  T key_scratch_buff[g_maximum_conditions_to_coalesce];
514  const size_t key_size_in_bytes = key_component_count * sizeof(T);
515  const size_t hash_entry_size =
516  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
517  auto key_buff_handler = [hash_buff,
518  entry_count,
519  with_val_slot,
520  invalid_slot_val,
521  key_size_in_bytes,
522  hash_entry_size](const int64_t entry_idx,
523  const T* key_scratch_buffer,
524  const size_t key_component_count) {
525  return write_baseline_hash_slot<T>(entry_idx,
526  hash_buff,
527  entry_count,
528  key_scratch_buffer,
529  key_component_count,
530  with_val_slot,
531  invalid_slot_val,
532  key_size_in_bytes,
533  hash_entry_size);
534  };
535 
536  JoinColumnTuple cols(
537  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
538  for (auto& it : cols.slice(start, step)) {
539  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
540  if (err) {
541  return err;
542  }
543  }
544  return 0;
545 }
char * f
const size_t g_maximum_conditions_to_coalesce
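
The baseline (composite-key) table filled above is a flat buffer of fixed-size entries: key_component_count keys of type T, optionally followed by one T-sized value slot, which is where the key_size_in_bytes and hash_entry_size expressions in the listing come from. A small sketch of that arithmetic under the same layout assumption (the helper names are illustrative):

#include <cstddef>
#include <cstdint>

// Entry layout assumed from the listing above: key_component_count keys of
// type T, optionally followed by one T-sized value slot.
template <typename T>
size_t baseline_hash_entry_size(size_t key_component_count, bool with_val_slot) {
  return (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
}

// Byte offset of entry `i` inside the flat hash buffer.
template <typename T>
size_t baseline_entry_offset(size_t i,
                             size_t key_component_count,
                             bool with_val_slot) {
  return i * baseline_hash_entry_size<T>(key_component_count, with_val_slot);
}
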
int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1672 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1680  {
1681  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1682  entry_count,
1683  invalid_slot_val,
1684  key_component_count,
1685  with_val_slot,
1686  key_handler,
1687  num_elems,
1688  cpu_thread_idx,
1689  cpu_thread_count);
1690 }


int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1712 of file HashJoinRuntime.cpp.

1720  {
1721  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1722  entry_count,
1723  invalid_slot_val,
1724  key_component_count,
1725  with_val_slot,
1726  key_handler,
1727  num_elems,
1728  cpu_thread_idx,
1729  cpu_thread_count);
1730 }
DEVICE int SUFFIX() fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 193 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

200  {
201  auto slot_selector = [&](auto elem) {
202  return SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
203  };
204  return fill_hash_join_buff_impl(buff,
205  invalid_slot_val,
206  join_column,
207  type_info,
208  sd_inner_proxy,
209  sd_outer_proxy,
210  cpu_thread_idx,
211  cpu_thread_count,
212  slot_selector);
213 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val


DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 169 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

177  {
178  auto slot_selector = [&](auto elem) {
179  return SUFFIX(get_bucketized_hash_slot)(
180  buff, elem, type_info.min_val, bucket_normalization);
181  };
182  return fill_hash_join_buff_impl(buff,
183  invalid_slot_val,
184  join_column,
185  type_info,
186  sd_inner_proxy,
187  sd_outer_proxy,
188  cpu_thread_idx,
189  cpu_thread_count,
190  slot_selector);
191 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val


template<typename SLOT_SELECTOR >
DEVICE auto fill_hash_join_buff_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_sel 
)

Definition at line 121 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_cas, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff(), and fill_hash_join_buff_bucketized().

129  {
130 #ifdef __CUDACC__
131  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
132  int32_t step = blockDim.x * gridDim.x;
133 #else
134  int32_t start = cpu_thread_idx;
135  int32_t step = cpu_thread_count;
136 #endif
137  JoinColumnTyped col{&join_column, &type_info};
138  for (auto item : col.slice(start, step)) {
139  const size_t index = item.index;
140  int64_t elem = item.element;
141  if (elem == type_info.null_val) {
142  if (type_info.uses_bw_eq) {
143  elem = type_info.translated_null_val;
144  } else {
145  continue;
146  }
147  }
148 #ifndef __CUDACC__
149  if (sd_inner_proxy &&
150  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
151  const auto outer_id = translate_str_id_to_outer_dict(
152  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
153  if (outer_id == StringDictionary::INVALID_STR_ID) {
154  continue;
155  }
156  elem = outer_id;
157  }
158  CHECK_GE(elem, type_info.min_val)
159  << "Element " << elem << " less than min val " << type_info.min_val;
160 #endif
161  int32_t* entry_ptr = slot_sel(elem);
162  if (mapd_cas(entry_ptr, invalid_slot_val, index) != invalid_slot_val) {
163  return -1;
164  }
165  }
166  return 0;
167 };
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define mapd_cas(address, compare, val)
const int64_t max_val
const int64_t min_val

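fill_hash_join_buff_impl is the one-to-one ("perfect") build: the slot selector maps a key to a single entry (essentially buff[elem - min_val], optionally bucket-normalized or sharded) and mapd_cas publishes the row index into it; a slot that is already taken means a duplicate key, and the non-zero return tells the caller to fall back to a one-to-many layout. A single-threaded sketch of that invariant, with buff pre-initialized to invalid_slot_val by the caller:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged sketch: one slot per key value starting at min_val; returns -1 on the
// first duplicate key, mirroring the runtime's contract.
int fill_one_to_one(std::vector<int32_t>& buff,
                    const std::vector<int64_t>& column,
                    int64_t min_val,
                    int32_t invalid_slot_val) {
  for (std::size_t row = 0; row < column.size(); ++row) {
    auto& slot = buff[column[row] - min_val];
    if (slot != invalid_slot_val) {
      return -1;  // key already claimed: one-to-one layout is not possible
    }
    slot = static_cast<int32_t>(row);  // mapd_cas does this test-and-set atomically
  }
  return 0;
}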

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 303 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

311  {
312  auto slot_selector = [&](auto elem, auto shard) {
313  return SUFFIX(get_hash_slot_sharded_opt)(buff,
314  elem,
315  type_info.min_val,
316  shard_info.entry_count_per_shard,
317  shard,
318  shard_info.num_shards,
319  shard_info.device_count);
320  };
321  return fill_hash_join_buff_sharded_impl(buff,
322  invalid_slot_val,
323  join_column,
324  type_info,
325  shard_info,
326  sd_inner_proxy,
327  sd_outer_proxy,
328  cpu_thread_idx,
329  cpu_thread_count,
330  slot_selector);
331 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:89
const int64_t min_val


DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 269 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

279  {
280  auto slot_selector = [&](auto elem, auto shard) -> auto {
281  return SUFFIX(get_bucketized_hash_slot_sharded_opt)(buff,
282  elem,
283  type_info.min_val,
284  shard_info.entry_count_per_shard,
285  shard,
286  shard_info.num_shards,
287  shard_info.device_count,
288  bucket_normalization);
289  };
290 
291  return fill_hash_join_buff_sharded_impl(buff,
292  invalid_slot_val,
293  join_column,
294  type_info,
295  shard_info,
296  sd_inner_proxy,
297  sd_outer_proxy,
298  cpu_thread_idx,
299  cpu_thread_count,
300  slot_selector);
301 }
const size_t num_shards
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:74
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val


template<typename SLOT_SELECTOR >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_sel 
)

Definition at line 216 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_cas, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

225  {
226 #ifdef __CUDACC__
227  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
228  int32_t step = blockDim.x * gridDim.x;
229 #else
230  int32_t start = cpu_thread_idx;
231  int32_t step = cpu_thread_count;
232 #endif
233  JoinColumnTyped col{&join_column, &type_info};
234  for (auto item : col.slice(start, step)) {
235  const size_t index = item.index;
236  int64_t elem = item.element;
237  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
238  if (shard != shard_info.shard) {
239  continue;
240  }
241  if (elem == type_info.null_val) {
242  if (type_info.uses_bw_eq) {
243  elem = type_info.translated_null_val;
244  } else {
245  continue;
246  }
247  }
248 #ifndef __CUDACC__
249  if (sd_inner_proxy &&
250  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
251  const auto outer_id = translate_str_id_to_outer_dict(
252  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
253  if (outer_id == StringDictionary::INVALID_STR_ID) {
254  continue;
255  }
256  elem = outer_id;
257  }
258  CHECK_GE(elem, type_info.min_val)
259  << "Element " << elem << " less than min val " << type_info.min_val;
260 #endif
261  int32_t* entry_ptr = slot_sel(elem, shard);
262  if (mapd_cas(entry_ptr, invalid_slot_val, index) != invalid_slot_val) {
263  return -1;
264  }
265  }
266  return 0;
267 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define mapd_cas(address, compare, val)
const size_t shard
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

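The sharded builders differ only in that each worker first routes the key to a shard with SHARD_FOR_KEY and skips rows it does not own, and the slot selector then indexes into that shard's entry_count_per_shard-sized region. A hedged sketch of the ownership check, assuming SHARD_FOR_KEY reduces to a modulo on the key (see shard_key.h for the actual definition):

#include <cstddef>
#include <cstdint>

// Assumption: SHARD_FOR_KEY(key, num_shards) behaves like a modulo on the
// (non-negative) key.
inline std::size_t shard_for_key(int64_t key, std::size_t num_shards) {
  return static_cast<std::size_t>(key < 0 ? -key : key) % num_shards;
}

// A worker building shard `my_shard` only inserts the rows it owns.
inline bool owned_by_shard(int64_t elem, std::size_t num_shards, std::size_t my_shard) {
  return shard_for_key(elem, num_shards) == my_shard;
}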

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const size_t  cpu_thread_count 
)

Definition at line 1753 of file HashJoinRuntime.cpp.

References CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), i, inclusive_scan(), and SUFFIX.

1764  {
1765  int32_t* pos_buff = buff;
1766  int32_t* count_buff = buff + hash_entry_count;
1767  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1768  std::vector<std::future<void>> counter_threads;
1769  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1770  if (join_buckets_per_key.size() > 0) {
1771  counter_threads.push_back(std::async(
1772  std::launch::async,
1773  [count_buff,
1774  composite_key_dict,
1775  &hash_entry_count,
1776  &join_buckets_per_key,
1777  &join_column_per_key,
1778  cpu_thread_idx,
1779  cpu_thread_count] {
1780  const auto key_handler = OverlapsKeyHandler(
1781  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1782  &join_column_per_key[0],
1783  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1784  count_matches_baseline(count_buff,
1785  composite_key_dict,
1786  hash_entry_count,
1787  &key_handler,
1788  join_column_per_key[0].num_elems,
1789  cpu_thread_idx,
1790  cpu_thread_count);
1791  }));
1792  } else {
1793  counter_threads.push_back(std::async(
1794  std::launch::async,
1795  [count_buff,
1796  composite_key_dict,
1797  &key_component_count,
1798  &hash_entry_count,
1799  &join_column_per_key,
1800  &type_info_per_key,
1801  &sd_inner_proxy_per_key,
1802  &sd_outer_proxy_per_key,
1803  cpu_thread_idx,
1804  cpu_thread_count] {
1805  const auto key_handler = GenericKeyHandler(key_component_count,
1806  true,
1807  &join_column_per_key[0],
1808  &type_info_per_key[0],
1809  &sd_inner_proxy_per_key[0],
1810  &sd_outer_proxy_per_key[0]);
1811  count_matches_baseline(count_buff,
1812  composite_key_dict,
1813  hash_entry_count,
1814  &key_handler,
1815  join_column_per_key[0].num_elems,
1816  cpu_thread_idx,
1817  cpu_thread_count);
1818  }));
1819  }
1820  }
1821 
1822  for (auto& child : counter_threads) {
1823  child.get();
1824  }
1825 
1826  std::vector<int32_t> count_copy(hash_entry_count, 0);
1827  CHECK_GT(hash_entry_count, int64_t(0));
1828  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1829  inclusive_scan(
1830  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1831  std::vector<std::future<void>> pos_threads;
1832  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1833  pos_threads.push_back(std::async(
1834  std::launch::async,
1835  [&](const int thread_idx) {
1836  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1837  if (count_buff[i]) {
1838  pos_buff[i] = count_copy[i];
1839  }
1840  }
1841  },
1842  cpu_thread_idx));
1843  }
1844  for (auto& child : pos_threads) {
1845  child.get();
1846  }
1847 
1848  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1849  std::vector<std::future<void>> rowid_threads;
1850  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1851  if (join_buckets_per_key.size() > 0) {
1852  rowid_threads.push_back(std::async(
1853  std::launch::async,
1854  [buff,
1855  composite_key_dict,
1856  hash_entry_count,
1857  invalid_slot_val,
1858  &join_column_per_key,
1859  &join_buckets_per_key,
1860  cpu_thread_idx,
1861  cpu_thread_count] {
1862  const auto key_handler = OverlapsKeyHandler(
1863  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1864  &join_column_per_key[0],
1865  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1866  SUFFIX(fill_row_ids_baseline)
1867  (buff,
1868  composite_key_dict,
1869  hash_entry_count,
1870  invalid_slot_val,
1871  &key_handler,
1872  join_column_per_key[0].num_elems,
1873  cpu_thread_idx,
1874  cpu_thread_count);
1875  }));
1876  } else {
1877  rowid_threads.push_back(std::async(std::launch::async,
1878  [buff,
1879  composite_key_dict,
1880  hash_entry_count,
1881  invalid_slot_val,
1882  key_component_count,
1883  &join_column_per_key,
1884  &type_info_per_key,
1885  &sd_inner_proxy_per_key,
1886  &sd_outer_proxy_per_key,
1887  cpu_thread_idx,
1888  cpu_thread_count] {
1889  const auto key_handler = GenericKeyHandler(
1890  key_component_count,
1891  true,
1892  &join_column_per_key[0],
1893  &type_info_per_key[0],
1894  &sd_inner_proxy_per_key[0],
1895  &sd_outer_proxy_per_key[0]);
1896  SUFFIX(fill_row_ids_baseline)
1897  (buff,
1898  composite_key_dict,
1899  hash_entry_count,
1900  invalid_slot_val,
1901  &key_handler,
1902  join_column_per_key[0].num_elems,
1903  cpu_thread_idx,
1904  cpu_thread_count);
1905  }));
1906  }
1907  }
1908 
1909  for (auto& child : rowid_threads) {
1910  child.get();
1911  }
1912 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

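The function above follows the same three-pass recipe as every one-to-many builder in this file: count the matches per slot, turn the counts into start offsets with a prefix scan (the memcpy into count_copy[1] followed by inclusive_scan amounts to an exclusive scan), then zero the counts and reuse them as cursors while scattering row ids. The buffer holds three consecutive regions: positions, counts, and row ids. A single-threaded sketch of the recipe, assuming each row's hash slot has already been computed:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// buff layout used by the one-to-many builders:
// [ pos (hash_entry_count) | count (hash_entry_count) | row ids (num rows) ]
std::vector<int32_t> build_one_to_many(const std::vector<int32_t>& slot_of_row,
                                       int32_t hash_entry_count) {
  std::vector<int32_t> buff(2 * hash_entry_count + slot_of_row.size(), 0);
  int32_t* pos_buff = buff.data();
  int32_t* count_buff = buff.data() + hash_entry_count;
  int32_t* id_buff = buff.data() + 2 * hash_entry_count;

  for (auto slot : slot_of_row) {  // pass 1: count matches per slot
    ++count_buff[slot];
  }
  // pass 2: exclusive prefix sum of the counts gives each slot's start offset
  std::exclusive_scan(count_buff, count_buff + hash_entry_count, pos_buff, 0);
  std::fill(count_buff, count_buff + hash_entry_count, 0);  // reuse as cursors
  for (int32_t row = 0; row < static_cast<int32_t>(slot_of_row.size()); ++row) {
    const auto slot = slot_of_row[row];  // pass 3: scatter row ids
    id_buff[pos_buff[slot] + count_buff[slot]++] = row;
  }
  return buff;
}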

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count 
)

Definition at line 1914 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1925  {
1926  fill_one_to_many_baseline_hash_table<int32_t>(buff,
1927  composite_key_dict,
1928  hash_entry_count,
1929  invalid_slot_val,
1930  key_component_count,
1931  join_column_per_key,
1932  type_info_per_key,
1933  join_bucket_info,
1934  sd_inner_proxy_per_key,
1935  sd_outer_proxy_per_key,
1936  cpu_thread_count);
1937 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count 
)

Definition at line 1939 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1950  {
1951  fill_one_to_many_baseline_hash_table<int64_t>(buff,
1952  composite_key_dict,
1953  hash_entry_count,
1954  invalid_slot_val,
1955  key_component_count,
1956  join_column_per_key,
1957  type_info_per_key,
1958  join_bucket_info,
1959  sd_inner_proxy_per_key,
1960  sd_outer_proxy_per_key,
1961  cpu_thread_count);
1962 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1381 of file HashJoinRuntime.cpp.

References count_matches(), fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1388  {
1389  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1390  invalid_slot_val,
1391  &join_column,
1392  &type_info,
1393  sd_inner_proxy,
1394  sd_outer_proxy](auto cpu_thread_idx,
1395  auto cpu_thread_count) {
1396  SUFFIX(count_matches)
1397  (count_buff,
1398  invalid_slot_val,
1399  join_column,
1400  type_info,
1401  sd_inner_proxy,
1402  sd_outer_proxy,
1403  cpu_thread_idx,
1404  cpu_thread_count);
1405  };
1406  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1407  buff,
1408  invalid_slot_val,
1409  &join_column,
1410  &type_info,
1411  sd_inner_proxy,
1412  sd_outer_proxy](auto cpu_thread_idx,
1413  auto cpu_thread_count) {
1414  SUFFIX(fill_row_ids)
1415  (buff,
1416  hash_entry_count,
1417  invalid_slot_val,
1418  join_column,
1419  type_info,
1420  sd_inner_proxy,
1421  sd_outer_proxy,
1422  cpu_thread_idx,
1423  cpu_thread_count);
1424  };
1425 
1426  fill_one_to_many_hash_table_impl(buff,
1427  hash_entry_info.hash_entry_count,
1428  invalid_slot_val,
1429  join_column,
1430  type_info,
1431  sd_inner_proxy,
1432  sd_outer_proxy,
1433  cpu_thread_count,
1434  launch_count_matches,
1435  launch_fill_row_ids);
1436 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t hash_entry_count
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1438 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1445  {
1446  auto bucket_normalization = hash_entry_info.bucket_normalization;
1447  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1448  auto launch_count_matches = [bucket_normalization,
1449  count_buff = buff + hash_entry_count,
1450  invalid_slot_val,
1451  &join_column,
1452  &type_info,
1453  sd_inner_proxy,
1454  sd_outer_proxy](auto cpu_thread_idx,
1455  auto cpu_thread_count) {
1456  SUFFIX(count_matches_bucketized)
1457  (count_buff,
1458  invalid_slot_val,
1459  join_column,
1460  type_info,
1461  sd_inner_proxy,
1462  sd_outer_proxy,
1463  cpu_thread_idx,
1464  cpu_thread_count,
1465  bucket_normalization);
1466  };
1467  auto launch_fill_row_ids = [bucket_normalization,
1468  hash_entry_count,
1469  buff,
1470  invalid_slot_val,
1471  &join_column,
1472  &type_info,
1473  sd_inner_proxy,
1474  sd_outer_proxy](auto cpu_thread_idx,
1475  auto cpu_thread_count) {
1476  SUFFIX(fill_row_ids_bucketized)
1477  (buff,
1478  hash_entry_count,
1479  invalid_slot_val,
1480  join_column,
1481  type_info,
1482  sd_inner_proxy,
1483  sd_outer_proxy,
1484  cpu_thread_idx,
1485  cpu_thread_count,
1486  bucket_normalization);
1487  };
1488 
1489  fill_one_to_many_hash_table_impl(buff,
1490  hash_entry_count,
1491  invalid_slot_val,
1492  join_column,
1493  type_info,
1494  sd_inner_proxy,
1495  sd_outer_proxy,
1496  cpu_thread_count,
1497  launch_count_matches,
1498  launch_fill_row_ids);
1499 }
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
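As a side note on the arithmetic this bucketized variant relies on: a key is assumed to map to slot (key - min_key) / bucket_normalization, so a column stored at a finer granularity than it is joined on collapses to one slot per bucket, and the table only needs getNormalizedHashEntryCount() entries. The sketch below is illustrative only; the helper name and the epoch-seconds scenario are assumptions, not taken from this file.

#include <cstdint>
#include <iostream>

// Assumed to mirror the indexing of get_bucketized_hash_slot() in JoinHashImpl.h.
int64_t bucketized_slot(int64_t key, int64_t min_key, int64_t bucket_normalization) {
  return (key - min_key) / bucket_normalization;
}

int main() {
  const int64_t bucket_normalization = 86400;  // hypothetical: seconds per day
  const int64_t min_key = 0;
  // Keys one day apart land in adjacent slots instead of 86400 slots apart.
  std::cout << bucketized_slot(0, min_key, bucket_normalization) << ' '        // 0
            << bucketized_slot(86400, min_key, bucket_normalization) << ' '    // 1
            << bucketized_slot(172799, min_key, bucket_normalization) << '\n'; // 1
}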

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1320 of file HashJoinRuntime.cpp.

References CHECK_GT, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1329  {
1330  int32_t* pos_buff = buff;
1331  int32_t* count_buff = buff + hash_entry_count;
1332  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1333  std::vector<std::future<void>> counter_threads;
1334  for (unsigned cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1335  counter_threads.push_back(std::async(
1336  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1337  }
1338 
1339  for (auto& child : counter_threads) {
1340  child.get();
1341  }
1342 
1343  std::vector<int32_t> count_copy(hash_entry_count, 0);
1344  CHECK_GT(hash_entry_count, int64_t(0));
1345  memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1346 #if HAVE_CUDA
1347  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1348 #else
1349  inclusive_scan(
1350  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1351 #endif
1352  std::vector<std::future<void>> pos_threads;
1353  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1354  pos_threads.push_back(std::async(
1355  std::launch::async,
1356  [&](size_t thread_idx) {
1357  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1358  if (count_buff[i]) {
1359  pos_buff[i] = count_copy[i];
1360  }
1361  }
1362  },
1363  cpu_thread_idx));
1364  }
1365  for (auto& child : pos_threads) {
1366  child.get();
1367  }
1368 
1369  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1370  std::vector<std::future<void>> rowid_threads;
1371  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1372  rowid_threads.push_back(std::async(
1373  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1374  }
1375 
1376  for (auto& child : rowid_threads) {
1377  child.get();
1378  }
1379 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209
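To make the three phases above easier to follow, here is a minimal single-threaded sketch of the same [pos | count | row ids] buffer layout, assuming a trivial perfect hash (slot = key - min_key). It is illustrative only and does not use the real JoinColumn machinery or the multithreaded scan.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const std::vector<int64_t> column{3, 1, 3, 2, 3};  // hypothetical join column
  const int64_t min_key = 1;
  const int64_t hash_entry_count = 3;  // keys 1..3

  // buff = [pos (N) | count (N) | row ids (num rows)]
  std::vector<int32_t> buff(2 * hash_entry_count + column.size(), 0);
  int32_t* pos = buff.data();
  int32_t* count = pos + hash_entry_count;
  int32_t* ids = count + hash_entry_count;

  // Phase 1: count matches per slot.
  for (auto key : column) {
    ++count[key - min_key];
  }

  // Phase 2: an exclusive prefix sum of the counts gives each slot's start offset.
  std::vector<int32_t> offsets(hash_entry_count, 0);
  std::partial_sum(count, count + hash_entry_count - 1, offsets.begin() + 1);
  for (int64_t i = 0; i < hash_entry_count; ++i) {
    if (count[i]) {
      pos[i] = offsets[i];
    }
    count[i] = 0;  // reset; reused as a per-slot cursor in phase 3
  }

  // Phase 3: append each row index to its slot's id range.
  for (std::size_t row = 0; row < column.size(); ++row) {
    const auto slot = column[row] - min_key;
    ids[pos[slot] + count[slot]++] = static_cast<int32_t>(row);
  }

  for (int64_t i = 0; i < hash_entry_count; ++i) {
    std::cout << "key " << (i + min_key) << ": rows";
    for (int32_t j = 0; j < count[i]; ++j) {
      std::cout << ' ' << ids[pos[i] + j];
    }
    std::cout << '\n';
  }
}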

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1561 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1569  {
1570  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1571  invalid_slot_val,
1572  &join_column,
1573  &type_info,
1574  &shard_info
1575 #ifndef __CUDACC__
1576  ,
1577  sd_inner_proxy,
1578  sd_outer_proxy
1579 #endif
1580  ](auto cpu_thread_idx, auto cpu_thread_count) {
1581  return SUFFIX(count_matches_sharded)(count_buff,
1582  invalid_slot_val,
1583  join_column,
1584  type_info,
1585  shard_info
1586 #ifndef __CUDACC__
1587  ,
1588  sd_inner_proxy,
1589  sd_outer_proxy,
1590  cpu_thread_idx,
1591  cpu_thread_count
1592 #endif
1593  );
1594  };
1595 
1596  auto launch_fill_row_ids = [buff,
1597  hash_entry_count,
1598  invalid_slot_val,
1599  &join_column,
1600  &type_info,
1601  &shard_info
1602 #ifndef __CUDACC__
1603  ,
1604  sd_inner_proxy,
1605  sd_outer_proxy
1606 #endif
1607  ](auto cpu_thread_idx, auto cpu_thread_count) {
1608  return SUFFIX(fill_row_ids_sharded)(buff,
1609  hash_entry_count,
1610  invalid_slot_val,
1611  join_column,
1612  type_info,
1613  shard_info
1614 #ifndef __CUDACC__
1615  ,
1616  sd_inner_proxy,
1617  sd_outer_proxy,
1618  cpu_thread_idx,
1619  cpu_thread_count);
1620 #endif
1621  };
1622 
1623  fill_one_to_many_hash_table_sharded_impl(buff,
1624  hash_entry_count,
1625  invalid_slot_val,
1626  join_column,
1627  type_info,
1628  shard_info
1629 #ifndef __CUDACC__
1630  ,
1631  sd_inner_proxy,
1632  sd_outer_proxy,
1633  cpu_thread_count
1634 #endif
1635  ,
1636  launch_count_matches,
1637  launch_fill_row_ids);
1638 }
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1502 of file HashJoinRuntime.cpp.

References CHECK_GT, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1513  {
1514  int32_t* pos_buff = buff;
1515  int32_t* count_buff = buff + hash_entry_count;
1516  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1517  std::vector<std::future<void>> counter_threads;
1518  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1519  counter_threads.push_back(std::async(
1520  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1521  }
1522 
1523  for (auto& child : counter_threads) {
1524  child.get();
1525  }
1526 
1527  std::vector<int32_t> count_copy(hash_entry_count, 0);
1528  CHECK_GT(hash_entry_count, int64_t(0));
1529  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1530  inclusive_scan(
1531  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1532  std::vector<std::future<void>> pos_threads;
1533  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1534  pos_threads.push_back(std::async(
1535  std::launch::async,
1536  [&](const unsigned thread_idx) {
1537  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1538  if (count_buff[i]) {
1539  pos_buff[i] = count_copy[i];
1540  }
1541  }
1542  },
1543  cpu_thread_idx));
1544  }
1545  for (auto& child : pos_threads) {
1546  child.get();
1547  }
1548 
1549  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1550  std::vector<std::future<void>> rowid_threads;
1551  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1552  rowid_threads.push_back(std::async(
1553  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1554  }
1555 
1556  for (auto& child : rowid_threads) {
1557  child.get();
1558  }
1559 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 859 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

871  {
872  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
873  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
874  };
875 
876  fill_row_ids_impl(buff,
877  hash_entry_count,
878  invalid_slot_val,
879  join_column,
880  type_info
881 #ifndef __CUDACC__
882  ,
883  sd_inner_proxy,
884  sd_outer_proxy,
885  cpu_thread_idx,
886  cpu_thread_count
887 #endif
888  ,
889  slot_sel);
890 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1070 of file HashJoinRuntime.cpp.

References CHECK_NE, g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

1081  {
1082  int32_t* pos_buff = buff;
1083  int32_t* count_buff = buff + hash_entry_count;
1084  int32_t* id_buff = count_buff + hash_entry_count;
1085 #ifdef __CUDACC__
1086  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1087  int32_t step = blockDim.x * gridDim.x;
1088 #else
1089  int32_t start = cpu_thread_idx;
1090  int32_t step = cpu_thread_count;
1091 #endif
1092 
1093  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1094 #ifdef __CUDACC__
1095  assert(composite_key_dict);
1096 #endif
1097  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1098  auto key_buff_handler = [composite_key_dict,
1099  hash_entry_count,
1100  pos_buff,
1101  invalid_slot_val,
1102  count_buff,
1103  id_buff,
1104  key_size_in_bytes](const int64_t row_index,
1105  const T* key_scratch_buff,
1106  const size_t key_component_count) {
1107  const T* matching_group =
1108  SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,
1109  key_component_count,
1110  composite_key_dict,
1111  hash_entry_count,
1112  key_size_in_bytes);
1113  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1114  int32_t* pos_ptr = pos_buff + entry_idx;
1115 #ifndef __CUDACC__
1116  CHECK_NE(*pos_ptr, invalid_slot_val);
1117 #endif
1118  const auto bin_idx = pos_ptr - pos_buff;
1119  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1120  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1121  return 0;
1122  };
1123 
1124  JoinColumnTuple cols(
1125  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1126  for (auto& it : cols.slice(start, step)) {
1127  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1128  }
1129  return;
1130 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define CHECK_NE(x, y)
Definition: Logger.h:206
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce
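The entry-index arithmetic at line 1113 relies on the composite key dictionary being a flat array in which every entry is key_component_count values wide. A tiny standalone illustration (the dictionary contents are made up):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::ptrdiff_t key_component_count = 2;
  // Three composite keys of two int64_t components each, stored back to back.
  const std::vector<int64_t> composite_key_dict = {1, 10, 2, 20, 3, 30};

  const int64_t* matching_group = composite_key_dict.data() + 4;  // points at key {3, 30}
  const auto entry_idx =
      (matching_group - composite_key_dict.data()) / key_component_count;
  std::cout << entry_idx << '\n';  // 2
}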

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 892 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

905  {
906  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
907  return SUFFIX(get_bucketized_hash_slot)(
908  pos_buff, elem, type_info.min_val, bucket_normalization);
909  };
910  fill_row_ids_impl(buff,
911  hash_entry_count,
912  invalid_slot_val,
913  join_column,
914  type_info
915 #ifndef __CUDACC__
916  ,
917  sd_inner_proxy,
918  sd_outer_proxy,
919  cpu_thread_idx,
920  cpu_thread_count
921 #endif
922  ,
923  slot_sel);
924 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 800 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

813  {
814  int32_t* pos_buff = buff;
815  int32_t* count_buff = buff + hash_entry_count;
816  int32_t* id_buff = count_buff + hash_entry_count;
817 
818 #ifdef __CUDACC__
819  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
820  int32_t step = blockDim.x * gridDim.x;
821 #else
822  int32_t start = cpu_thread_idx;
823  int32_t step = cpu_thread_count;
824 #endif
825  JoinColumnTyped col{&join_column, &type_info};
826  for (auto item : col.slice(start, step)) {
827  const size_t index = item.index;
828  int64_t elem = item.element;
829  if (elem == type_info.null_val) {
830  if (type_info.uses_bw_eq) {
831  elem = type_info.translated_null_val;
832  } else {
833  continue;
834  }
835  }
836 #ifndef __CUDACC__
837  if (sd_inner_proxy &&
838  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
839  const auto outer_id = translate_str_id_to_outer_dict(
840  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
841  if (outer_id == StringDictionary::INVALID_STR_ID) {
842  continue;
843  }
844  elem = outer_id;
845  }
846  CHECK_GE(elem, type_info.min_val)
847  << "Element " << elem << " less than min val " << type_info.min_val;
848 #endif
849  auto pos_ptr = slot_selector(pos_buff, elem);
850 #ifndef __CUDACC__
851  CHECK_NE(*pos_ptr, invalid_slot_val);
852 #endif
853  const auto bin_idx = pos_ptr - pos_buff;
854  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
855  id_buff[id_buff_idx] = static_cast<int32_t>(index);
856  }
857 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:206
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)
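A standalone sketch of the append step above, with std::atomic standing in for mapd_add: during fill_row_ids_impl, each slot's entry in count_buff acts as an atomic cursor, so threads can interleave freely while writing row ids into the slot's reserved [pos, pos + count) range. The slot offset and the row distribution below are made up for illustration.

#include <atomic>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  const int32_t pos = 0;            // start offset reserved for this slot (from pos_buff)
  const int32_t expected_rows = 8;  // rows hashing to this slot
  std::atomic<int32_t> cursor{0};   // this slot's entry in count_buff
  std::vector<int32_t> id_buff(expected_rows, -1);

  auto append = [&](int32_t first_row, int32_t step) {
    for (int32_t row = first_row; row < expected_rows; row += step) {
      const int32_t idx = cursor.fetch_add(1) + pos;  // mapd_add(count_buff + bin_idx, 1) + *pos_ptr
      id_buff[idx] = row;
    }
  };

  std::thread t0(append, 0, 2);
  std::thread t1(append, 1, 2);
  t0.join();
  t1.join();

  for (int32_t id : id_buff) {
    std::cout << id << ' ';  // all eight row ids, in some interleaved order
  }
  std::cout << '\n';
}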

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 988 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

1001  {
1002  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
1003  return SUFFIX(get_hash_slot_sharded)(pos_buff,
1004  elem,
1005  type_info.min_val,
1006  shard_info.entry_count_per_shard,
1007  shard_info.num_shards,
1008  shard_info.device_count);
1009  };
1010 
1011  fill_row_ids_impl(buff,
1012  hash_entry_count,
1013  invalid_slot_val,
1014  join_column,
1015  type_info
1016 #ifndef __CUDACC__
1017  ,
1018  sd_inner_proxy,
1019  sd_outer_proxy,
1020  cpu_thread_idx,
1021  cpu_thread_count
1022 #endif
1023  ,
1024  slot_sel);
1025 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:60
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1027 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1041  {
1042  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1043  auto elem) {
1044  return SUFFIX(get_bucketized_hash_slot_sharded)(pos_buff,
1045  elem,
1046  type_info.min_val,
1047  shard_info.entry_count_per_shard,
1048  shard_info.num_shards,
1049  shard_info.device_count,
1050  bucket_normalization);
1051  };
1052 
1053  fill_row_ids_impl(buff,
1054  hash_entry_count,
1055  invalid_slot_val,
1056  join_column,
1057  type_info
1058 #ifndef __CUDACC__
1059  ,
1060  sd_inner_proxy,
1061  sd_outer_proxy,
1062  cpu_thread_idx,
1063  cpu_thread_count
1064 #endif
1065  ,
1066  slot_sel);
1067 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:45
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 927 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

941  {
942 
943  int32_t* pos_buff = buff;
944  int32_t* count_buff = buff + hash_entry_count;
945  int32_t* id_buff = count_buff + hash_entry_count;
946 
947 #ifdef __CUDACC__
948  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
949  int32_t step = blockDim.x * gridDim.x;
950 #else
951  int32_t start = cpu_thread_idx;
952  int32_t step = cpu_thread_count;
953 #endif
954  JoinColumnTyped col{&join_column, &type_info};
955  for (auto item : col.slice(start, step)) {
956  const size_t index = item.index;
957  int64_t elem = item.element;
958  if (elem == type_info.null_val) {
959  if (type_info.uses_bw_eq) {
960  elem = type_info.translated_null_val;
961  } else {
962  continue;
963  }
964  }
965 #ifndef __CUDACC__
966  if (sd_inner_proxy &&
967  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
968  const auto outer_id = translate_str_id_to_outer_dict(
969  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
970  if (outer_id == StringDictionary::INVALID_STR_ID) {
971  continue;
972  }
973  elem = outer_id;
974  }
975  CHECK_GE(elem, type_info.min_val)
976  << "Element " << elem << " less than min val " << type_info.min_val;
977 #endif
978  auto* pos_ptr = slot_selector(pos_buff, elem);
979 #ifndef __CUDACC__
980  CHECK_NE(*pos_ptr, invalid_slot_val);
981 #endif
982  const auto bin_idx = pos_ptr - pos_buff;
983  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
984  id_buff[id_buff_idx] = static_cast<int32_t>(index);
985  }
986 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:206
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 420 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), i, load_cst, store_cst, SUFFIX, omnisci.dtypes::T, and UNLIKELY.

Referenced by write_baseline_hash_slot().

424  {
425  uint32_t off = h * hash_entry_size;
426  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
427  T empty_key = SUFFIX(get_invalid_key)<T>();
428  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
429  if (UNLIKELY(*key == write_pending)) {
430  // Address the singularity case where the first column contains the pending
431  // write special value. Should never happen, but avoid doing wrong things.
432  return nullptr;
433  }
434  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
435  if (success) {
436  if (key_component_count > 1) {
437  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
438  }
439  store_cst(row_ptr, *key);
440  return reinterpret_cast<T*>(row_ptr + key_component_count);
441  }
442  while (load_cst(row_ptr) == write_pending) {
443  // spin until the winning thread has finished writing the entire key
444  }
445  for (size_t i = 0; i < key_component_count; ++i) {
446  if (load_cst(row_ptr + i) != key[i]) {
447  return nullptr;
448  }
449  }
450  return reinterpret_cast<T*>(row_ptr + key_component_count);
451 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)
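The sketch below reproduces the claim/publish protocol in standalone form, using std::atomic operations in place of the cas_cst / store_cst / load_cst macros. The first key component doubles as the slot's state word: empty, then pending on a successful claim, then the real key[0] once the rest of the key has been written. The kEmpty / kPending sentinels and the two-component key are assumptions made for brevity, not values taken from this file.

#include <atomic>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

constexpr int64_t kEmpty = -1;    // stand-in for get_invalid_key<T>()
constexpr int64_t kPending = -2;  // stand-in for get_invalid_key<T>() - 1

// One hash entry: key0 is atomic and acts as the slot's state word.
struct Entry {
  std::atomic<int64_t> key0{kEmpty};
  int64_t key1 = 0;  // remaining key components are plain memory
};

// Returns true if this thread claimed the slot or the slot already holds `key`.
bool claim_or_match(Entry& e, const int64_t key[2]) {
  int64_t expected = kEmpty;
  if (e.key0.compare_exchange_strong(expected, kPending)) {
    e.key1 = key[1];       // write the tail of the key...
    e.key0.store(key[0]);  // ...then publish by storing the first component
    return true;
  }
  while (e.key0.load() == kPending) {
    // spin until the winning thread has published the entire key
  }
  return e.key0.load() == key[0] && e.key1 == key[1];
}

int main() {
  Entry slot;
  const int64_t key[2] = {42, 7};
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back([&] { claim_or_match(slot, key); });
  }
  for (auto& t : threads) {
    t.join();
  }
  std::cout << slot.key0.load() << ' ' << slot.key1 << '\n';  // 42 7
}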

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE NEVER_INLINE const T* SUFFIX() get_matching_baseline_hash_slot_readonly ( const T *  key,
const size_t  key_component_count,
const T *  composite_key_dict,
const int64_t  entry_count,
const size_t  key_size_in_bytes 
)

Definition at line 724 of file HashJoinRuntime.cpp.

References CHECK, keys_are_equal(), and MurmurHash1Impl().

Referenced by count_matches_baseline(), and fill_row_ids_baseline().

729  {
730  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
731  uint32_t off = h * key_component_count;
732  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
733  return &composite_key_dict[off];
734  }
735  uint32_t h_probe = (h + 1) % entry_count;
736  while (h_probe != h) {
737  off = h_probe * key_component_count;
738  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
739  return &composite_key_dict[off];
740  }
741  h_probe = (h_probe + 1) % entry_count;
742  }
743 #ifndef __CUDACC__
744  CHECK(false);
745 #else
746  assert(false);
747 #endif
748  return nullptr;
749 }
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename InputIterator , typename OutputIterator >
void inclusive_scan ( InputIterator  first,
InputIterator  last,
OutputIterator  out,
const size_t  thread_count 
)

Definition at line 1248 of file HashJoinRuntime.cpp.

References gpu_enabled::partial_sum().

Referenced by approximate_distinct_tuples_on_device_overlaps(), approximate_distinct_tuples_overlaps(), fill_one_to_many_baseline_hash_table(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_impl(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), fill_one_to_many_hash_table_sharded_impl(), and gpu_enabled::partial_sum().

1251  {
1252  using ElementType = typename InputIterator::value_type;
1253  using OffsetType = typename InputIterator::difference_type;
1254  const OffsetType elem_count = last - first;
1255  if (elem_count < 10000 || thread_count <= 1) {
1256  ElementType sum = 0;
1257  for (auto iter = first; iter != last; ++iter, ++out) {
1258  *out = sum += *iter;
1259  }
1260  return;
1261  }
1262 
1263  const OffsetType step = (elem_count + thread_count - 1) / thread_count;
1264  OffsetType start_off = 0;
1265  OffsetType end_off = std::min(step, elem_count);
1266  std::vector<ElementType> partial_sums(thread_count);
1267  std::vector<std::future<void>> counter_threads;
1268  for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,
1269  start_off = std::min(start_off + step, elem_count),
1270  end_off = std::min(start_off + step, elem_count)) {
1271  counter_threads.push_back(std::async(
1272  std::launch::async,
1273  [first, out](
1274  ElementType& partial_sum, const OffsetType start, const OffsetType end) {
1275  ElementType sum = 0;
1276  for (auto in_iter = first + start, out_iter = out + start;
1277  in_iter != (first + end);
1278  ++in_iter, ++out_iter) {
1279  *out_iter = sum += *in_iter;
1280  }
1281  partial_sum = sum;
1282  },
1283  std::ref(partial_sums[thread_idx]),
1284  start_off,
1285  end_off));
1286  }
1287  for (auto& child : counter_threads) {
1288  child.get();
1289  }
1290 
1291  ElementType sum = 0;
1292  for (auto& s : partial_sums) {
1293  s += sum;
1294  sum = s;
1295  }
1296 
1297  counter_threads.clear();
1298  start_off = std::min(step, elem_count);
1299  end_off = std::min(start_off + step, elem_count);
1300  for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,
1301  start_off = std::min(start_off + step, elem_count),
1302  end_off = std::min(start_off + step, elem_count)) {
1303  counter_threads.push_back(std::async(
1304  std::launch::async,
1305  [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {
1306  for (auto iter = out + start; iter != (out + end); ++iter) {
1307  *iter += prev_sum;
1308  }
1309  },
1310  partial_sums[thread_idx],
1311  start_off,
1312  end_off));
1313  }
1314  for (auto& child : counter_threads) {
1315  child.get();
1316  }
1317 }
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87
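The following self-contained sketch mirrors the same two-pass blocked scheme: each worker scans its own chunk, a serial pass accumulates the chunk totals, and a second parallel pass adds the total of all preceding chunks to every later chunk. It is only an illustration of the algorithm on a fixed small input, not the function above.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <future>
#include <iostream>
#include <numeric>
#include <vector>

void blocked_inclusive_scan(std::vector<int32_t>& v, const std::size_t thread_count) {
  const std::size_t n = v.size();
  const std::size_t step = (n + thread_count - 1) / thread_count;
  std::vector<int32_t> chunk_sums(thread_count, 0);

  // Pass 1: scan each chunk independently and record its total.
  std::vector<std::future<void>> workers;
  for (std::size_t t = 0; t < thread_count; ++t) {
    workers.push_back(std::async(std::launch::async, [&v, &chunk_sums, t, step, n] {
      const std::size_t begin = std::min(t * step, n);
      const std::size_t end = std::min(begin + step, n);
      int32_t sum = 0;
      for (std::size_t i = begin; i < end; ++i) {
        v[i] = sum += v[i];
      }
      chunk_sums[t] = sum;
    }));
  }
  for (auto& w : workers) {
    w.get();
  }

  // Serial prefix over the chunk totals.
  std::partial_sum(chunk_sums.begin(), chunk_sums.end(), chunk_sums.begin());

  // Pass 2: add the running total of preceding chunks to every later chunk.
  workers.clear();
  for (std::size_t t = 1; t < thread_count; ++t) {
    workers.push_back(std::async(std::launch::async, [&v, &chunk_sums, t, step, n] {
      const std::size_t begin = std::min(t * step, n);
      const std::size_t end = std::min(begin + step, n);
      for (std::size_t i = begin; i < end; ++i) {
        v[i] += chunk_sums[t - 1];
      }
    }));
  }
  for (auto& w : workers) {
    w.get();
  }
}

int main() {
  std::vector<int32_t> v(20);
  std::iota(v.begin(), v.end(), 1);  // 1..20
  blocked_inclusive_scan(v, 4);
  std::cout << v.back() << '\n';  // 210 == 20 * 21 / 2
}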

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 334 of file HashJoinRuntime.cpp.

References get_invalid_key(), i, SUFFIX, and omnisci.dtypes::T.

Referenced by init_baseline_hash_join_buff_wrapper().

340  {
341 #ifdef __CUDACC__
342  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
343  int32_t step = blockDim.x * gridDim.x;
344 #else
345  int32_t start = cpu_thread_idx;
346  int32_t step = cpu_thread_count;
347 #endif
348  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
349  const T empty_key = SUFFIX(get_invalid_key)<T>();
350  for (int64_t h = start; h < entry_count; h += step) {
351  int64_t off = h * hash_entry_size;
352  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
353  for (size_t i = 0; i < key_component_count; ++i) {
354  row_ptr[i] = empty_key;
355  }
356  if (with_val_slot) {
357  row_ptr[key_component_count] = invalid_slot_val;
358  }
359  }
360 }
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1640 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1646  {
1647  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1648  entry_count,
1649  key_component_count,
1650  with_val_slot,
1651  invalid_slot_val,
1652  cpu_thread_idx,
1653  cpu_thread_count);
1654 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1656 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1662  {
1663  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1664  entry_count,
1665  key_component_count,
1666  with_val_slot,
1667  invalid_slot_val,
1668  cpu_thread_idx,
1669  cpu_thread_count);
1670 }

+ Here is the caller graph for this function:

DEVICE void SUFFIX() init_hash_join_buff ( int32_t *  groups_buffer,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 92 of file HashJoinRuntime.cpp.

References i.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

96  {
97 #ifdef __CUDACC__
98  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
99  int32_t step = blockDim.x * gridDim.x;
100 #else
101  int32_t start = cpu_thread_idx;
102  int32_t step = cpu_thread_count;
103 #endif
104  for (int64_t i = start; i < hash_entry_count; i += step) {
105  groups_buffer[i] = invalid_slot_val;
106  }
107 }
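A minimal sketch of how the CPU side of this file drives the interleaved (start = thread index, step = thread count) partitioning with std::async; the same per-element work as init_hash_join_buff appears inside the per-thread lambda. The buffer size and thread count are arbitrary illustration values.

#include <cstdint>
#include <future>
#include <iostream>
#include <vector>

int main() {
  const int64_t hash_entry_count = 10;
  const int32_t invalid_slot_val = -1;
  const int32_t cpu_thread_count = 3;
  std::vector<int32_t> groups_buffer(hash_entry_count, 0);

  std::vector<std::future<void>> init_threads;
  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
    init_threads.push_back(std::async(std::launch::async, [&, cpu_thread_idx] {
      for (int64_t i = cpu_thread_idx; i < hash_entry_count; i += cpu_thread_count) {
        groups_buffer[i] = invalid_slot_val;  // same assignment as init_hash_join_buff
      }
    }));
  }
  for (auto& child : init_threads) {
    child.get();
  }

  for (auto v : groups_buffer) {
    std::cout << v << ' ';  // every entry now holds invalid_slot_val
  }
  std::cout << '\n';
}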

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1692 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1700  {
1701  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1702  entry_count,
1703  invalid_slot_val,
1704  key_component_count,
1705  with_val_slot,
1706  key_handler,
1707  num_elems,
1708  cpu_thread_idx,
1709  cpu_thread_count);
1710 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1732 of file HashJoinRuntime.cpp.

1740  {
1741  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1742  entry_count,
1743  invalid_slot_val,
1744  key_component_count,
1745  with_val_slot,
1746  key_handler,
1747  num_elems,
1748  cpu_thread_idx,
1749  cpu_thread_count);
1750 }
template<typename T >
DEVICE int write_baseline_hash_slot ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 460 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and omnisci.dtypes::T.

468  {
469  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
470  T* matching_group = get_matching_baseline_hash_slot_at(
471  hash_buff, h, key, key_component_count, hash_entry_size);
472  if (!matching_group) {
473  uint32_t h_probe = (h + 1) % entry_count;
474  while (h_probe != h) {
475  matching_group = get_matching_baseline_hash_slot_at(
476  hash_buff, h_probe, key, key_component_count, hash_entry_size);
477  if (matching_group) {
478  break;
479  }
480  h_probe = (h_probe + 1) % entry_count;
481  }
482  }
483  if (!matching_group) {
484  return -2;
485  }
486  if (!with_val_slot) {
487  return 0;
488  }
489  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
490  return -1;
491  }
492  return 0;
493 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)
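For reference, a single-threaded standalone sketch of the same probe-with-wraparound loop: start at hash(key) % entry_count and walk forward until a slot is claimed or the key is found, giving up only after a full cycle (the function above returns -2 in that case). The trivial modulo hash and the Slot struct are illustrative stand-ins for MurmurHash1Impl and the flat entry layout; the sketch also omits the concurrency handled by get_matching_baseline_hash_slot_at.

#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kEmpty = -1;  // stand-in for an empty key

struct Slot {
  int64_t key = kEmpty;
  int32_t val = -1;
};

// Returns the slot index holding `key`, or -1 if the table is full.
int64_t insert_or_get(std::vector<Slot>& table, const int64_t key, const int32_t val) {
  const auto entry_count = static_cast<int64_t>(table.size());
  const int64_t h = key % entry_count;  // stand-in for MurmurHash1Impl(...) % entry_count
  int64_t probe = h;
  do {
    Slot& slot = table[probe];
    if (slot.key == kEmpty) {  // claim an empty slot
      slot.key = key;
      slot.val = val;
      return probe;
    }
    if (slot.key == key) {  // key already present
      return probe;
    }
    probe = (probe + 1) % entry_count;
  } while (probe != h);
  return -1;  // table is full
}

int main() {
  std::vector<Slot> table(5);
  insert_or_get(table, 10, 100);                       // hashes to slot 0
  insert_or_get(table, 15, 101);                       // collides, probes to slot 1
  std::cout << insert_or_get(table, 15, 999) << '\n';  // finds the existing entry: 1
}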

+ Here is the call graph for this function: