OmniSciDB  467d548b97
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HashJoinRuntime.cpp File Reference
+ Include dependency graph for HashJoinRuntime.cpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict (const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE auto fill_hash_join_buff_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T
*SUFFIX() 
get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double bucket_sz_threshold, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const size_t cpu_thread_count)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void compute_bucket_sizes (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const double bucket_size_threshold, const int thread_count)
 

Macro Definition Documentation

#define cas_cst (   ptr,
  expected,
  desired 
)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 396 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst (   ptr)    __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 400 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add (   address,
  val 
)    __sync_fetch_and_add(address, val)
#define mapd_cas (   address,
  compare,
  val 
)    __sync_val_compare_and_swap(address, compare, val)
#define store_cst (   ptr,
  val 
)    __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 399 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 1939 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

1944  {
1945  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
1946  CHECK(!join_column_per_key.empty());
1947 
1948  std::vector<std::future<void>> approx_distinct_threads;
1949  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
1950  approx_distinct_threads.push_back(std::async(
1951  std::launch::async,
1952  [&join_column_per_key,
1953  &type_info_per_key,
1954  b,
1955  hll_buffer_all_cpus,
1956  padded_size_bytes,
1957  thread_idx,
1958  thread_count] {
1959  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
1960 
1961  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
1962  false,
1963  &join_column_per_key[0],
1964  &type_info_per_key[0],
1965  nullptr,
1966  nullptr);
1968  nullptr,
1969  b,
1970  join_column_per_key[0].num_elems,
1971  &key_handler,
1972  thread_idx,
1973  thread_count);
1974  }));
1975  }
1976  for (auto& child : approx_distinct_threads) {
1977  child.get();
1978  }
1979 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1114 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), and approximate_distinct_tuples_overlaps().

1124  {
1125 #ifdef __CUDACC__
1126  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1127  int32_t step = blockDim.x * gridDim.x;
1128 #else
1129  int32_t start = cpu_thread_idx;
1130  int32_t step = cpu_thread_count;
1131 #endif
1132 
1133  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1134  const int64_t entry_idx,
1135  const int64_t* key_scratch_buff,
1136  const size_t key_component_count) {
1137  if (row_count_buffer) {
1138  row_count_buffer[entry_idx] += 1;
1139  }
1140 
1141  const uint64_t hash =
1142  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1143  const uint32_t index = hash >> (64 - b);
1144  const auto rank = get_rank(hash << b, 64 - b);
1145 #ifdef __CUDACC__
1146  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1147 #else
1148  hll_buffer[index] = std::max(hll_buffer[index], rank);
1149 #endif
1150 
1151  return 0;
1152  };
1153 
1154  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1155 
1156  JoinColumnTuple cols(
1157  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1158  for (auto& it : cols.slice(start, step)) {
1159  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1160  }
1161 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 1981 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

1989  {
1990  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
1991  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
1992  CHECK(!join_column_per_key.empty());
1993 
1994  std::vector<std::future<void>> approx_distinct_threads;
1995  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
1996  approx_distinct_threads.push_back(std::async(
1997  std::launch::async,
1998  [&join_column_per_key,
1999  &join_buckets_per_key,
2000  &row_counts,
2001  b,
2002  hll_buffer_all_cpus,
2003  padded_size_bytes,
2004  thread_idx,
2005  thread_count] {
2006  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2007 
2008  const auto key_handler = OverlapsKeyHandler(
2009  join_buckets_per_key[0].bucket_sizes_for_dimension.size(),
2010  &join_column_per_key[0],
2011  join_buckets_per_key[0].bucket_sizes_for_dimension.data());
2013  row_counts.data(),
2014  b,
2015  join_column_per_key[0].num_elems,
2016  &key_handler,
2017  thread_idx,
2018  thread_count);
2019  }));
2020  }
2021  for (auto& child : approx_distinct_threads) {
2022  child.get();
2023  }
2024 
2026  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2027 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void compute_bucket_sizes ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double  bucket_size_threshold,
const int  thread_count 
)

Definition at line 2029 of file HashJoinRuntime.cpp.

Referenced by OverlapsJoinHashTable::computeBucketSizes().

2033  {
2034  std::vector<std::vector<double>> bucket_sizes_for_threads;
2035  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2036  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(),
2037  std::numeric_limits<double>::max());
2038  }
2039  std::vector<std::future<void>> threads;
2040  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2041  threads.push_back(std::async(std::launch::async,
2042  compute_bucket_sizes_impl<2>,
2043  bucket_sizes_for_threads[thread_idx].data(),
2044  &join_column,
2045  &type_info,
2046  bucket_size_threshold,
2047  thread_idx,
2048  thread_count));
2049  }
2050  for (auto& child : threads) {
2051  child.get();
2052  }
2053 
2054  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2055  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2056  if (bucket_sizes_for_threads[thread_idx][i] < bucket_sizes_for_dimension[i]) {
2057  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2058  }
2059  }
2060  }
2061 }

+ Here is the caller graph for this function:

template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double  bucket_sz_threshold,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1184 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), JoinColumnIterator::ptr(), and SUFFIX.

1189  {
1190 #ifdef __CUDACC__
1191  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1192  int32_t step = blockDim.x * gridDim.x;
1193 #else
1194  int32_t start = cpu_thread_idx;
1195  int32_t step = cpu_thread_count;
1196 #endif
1197  JoinColumnIterator it(join_column, type_info, start, step);
1198  for (; it; ++it) {
1199  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1200  double bounds[2 * N];
1201  for (size_t j = 0; j < 2 * N; j++) {
1202  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1203  }
1204 
1205  for (size_t j = 0; j < N; j++) {
1206  const auto diff = bounds[j + N] - bounds[j];
1207 #ifdef __CUDACC__
1208  if (diff > bucket_sz_threshold) {
1209  atomicMin(&bucket_sizes_for_thread[j], diff);
1210  }
1211 #else
1212  if (diff > bucket_sz_threshold && diff < bucket_sizes_for_thread[j]) {
1213  bucket_sizes_for_thread[j] = diff;
1214  }
1215 #endif
1216  }
1217  }
1218 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:126

+ Here is the call graph for this function:

GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 587 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

598  {
599  auto slot_sel = [&type_info](auto count_buff, auto elem) {
600  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
601  };
602  count_matches_impl(count_buff,
603  invalid_slot_val,
604  join_column,
605  type_info
606 #ifndef __CUDACC__
607  ,
608  sd_inner_proxy,
609  sd_outer_proxy,
610  cpu_thread_idx,
611  cpu_thread_count
612 #endif
613  ,
614  slot_sel);
615 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 731 of file HashJoinRuntime.cpp.

References assert(), g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, and SUFFIX.

Referenced by fill_one_to_many_baseline_hash_table().

741  {
742 #ifdef __CUDACC__
743  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
744  int32_t step = blockDim.x * gridDim.x;
745 #else
746  int32_t start = cpu_thread_idx;
747  int32_t step = cpu_thread_count;
748 #endif
749 #ifdef __CUDACC__
750  assert(composite_key_dict);
751 #endif
752  T key_scratch_buff[g_maximum_conditions_to_coalesce];
753  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
754  auto key_buff_handler = [composite_key_dict,
755  entry_count,
756  count_buff,
757  key_size_in_bytes](const int64_t row_entry_idx,
758  const T* key_scratch_buff,
759  const size_t key_component_count) {
760  const auto matching_group =
762  key_component_count,
763  composite_key_dict,
764  entry_count,
765  key_size_in_bytes);
766  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
767  mapd_add(&count_buff[entry_idx], int32_t(1));
768  return 0;
769  };
770 
771  JoinColumnTuple cols(
772  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
773  for (auto& it : cols.slice(start, step)) {
774  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
775  }
776 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
int64_t const int32_t sz assert(dest)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 617 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

629  {
630  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
632  count_buff, elem, type_info.min_val, bucket_normalization);
633  };
634  count_matches_impl(count_buff,
635  invalid_slot_val,
636  join_column,
637  type_info
638 #ifndef __CUDACC__
639  ,
640  sd_inner_proxy,
641  sd_outer_proxy,
642  cpu_thread_idx,
643  cpu_thread_count
644 #endif
645  ,
646  slot_sel);
647 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 539 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by count_matches(), and count_matches_bucketized().

551  {
552 #ifdef __CUDACC__
553  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
554  int32_t step = blockDim.x * gridDim.x;
555 #else
556  int32_t start = cpu_thread_idx;
557  int32_t step = cpu_thread_count;
558 #endif
559  JoinColumnTyped col{&join_column, &type_info};
560  for (auto item : col.slice(start, step)) {
561  int64_t elem = item.element;
562  if (elem == type_info.null_val) {
563  if (type_info.uses_bw_eq) {
564  elem = type_info.translated_null_val;
565  } else {
566  continue;
567  }
568  }
569 #ifndef __CUDACC__
570  if (sd_inner_proxy &&
571  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
572  const auto outer_id = translate_str_id_to_outer_dict(
573  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
574  if (outer_id == StringDictionary::INVALID_STR_ID) {
575  continue;
576  }
577  elem = outer_id;
578  }
579  CHECK_GE(elem, type_info.min_val)
580  << "Element " << elem << " less than min val " << type_info.min_val;
581 #endif
582  auto* entry_ptr = slot_selector(count_buff, elem);
583  mapd_add(entry_ptr, int32_t(1));
584  }
585 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 649 of file HashJoinRuntime.cpp.

References CHECK_GE, get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, mapd_add, SUFFIX, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

661  {
662 #ifdef __CUDACC__
663  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
664  int32_t step = blockDim.x * gridDim.x;
665 #else
666  int32_t start = cpu_thread_idx;
667  int32_t step = cpu_thread_count;
668 #endif
669  JoinColumnTyped col{&join_column, &type_info};
670  for (auto item : col.slice(start, step)) {
671  int64_t elem = item.element;
672  if (elem == type_info.null_val) {
673  if (type_info.uses_bw_eq) {
674  elem = type_info.translated_null_val;
675  } else {
676  continue;
677  }
678  }
679 #ifndef __CUDACC__
680  if (sd_inner_proxy &&
681  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
682  const auto outer_id = translate_str_id_to_outer_dict(
683  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
684  if (outer_id == StringDictionary::INVALID_STR_ID) {
685  continue;
686  }
687  elem = outer_id;
688  }
689  CHECK_GE(elem, type_info.min_val)
690  << "Element " << elem << " less than min val " << type_info.min_val;
691 #endif
692  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
693  elem,
694  type_info.min_val,
695  shard_info.entry_count_per_shard,
696  shard_info.num_shards,
697  shard_info.device_count);
698  mapd_add(entry_ptr, int32_t(1));
699  }
700 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:210
#define SUFFIX(name)
const int device_count
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const size_t entry_count_per_shard
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:60
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 479 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce.

Referenced by fill_baseline_hash_join_buff_wrapper().

487  {
488 #ifdef __CUDACC__
489  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
490  int32_t step = blockDim.x * gridDim.x;
491 #else
492  int32_t start = cpu_thread_idx;
493  int32_t step = cpu_thread_count;
494 #endif
495 
496  T key_scratch_buff[g_maximum_conditions_to_coalesce];
497  const size_t key_size_in_bytes = key_component_count * sizeof(T);
498  const size_t hash_entry_size =
499  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
500  auto key_buff_handler = [hash_buff,
501  entry_count,
502  with_val_slot,
503  invalid_slot_val,
504  key_size_in_bytes,
505  hash_entry_size](const int64_t entry_idx,
506  const T* key_scratch_buffer,
507  const size_t key_component_count) {
508  return write_baseline_hash_slot<T>(entry_idx,
509  hash_buff,
510  entry_count,
511  key_scratch_buffer,
512  key_component_count,
513  with_val_slot,
514  invalid_slot_val,
515  key_size_in_bytes,
516  hash_entry_size);
517  };
518 
519  JoinColumnTuple cols(
520  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
521  for (auto& it : cols.slice(start, step)) {
522  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
523  if (err) {
524  return err;
525  }
526  }
527  return 0;
528 }
const size_t g_maximum_conditions_to_coalesce

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1647 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTable::initHashTableOnCpu().

1655  {
1656  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1657  entry_count,
1658  invalid_slot_val,
1659  key_component_count,
1660  with_val_slot,
1661  key_handler,
1662  num_elems,
1663  cpu_thread_idx,
1664  cpu_thread_count);
1665 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1687 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTable::initHashTableOnCpu().

1695  {
1696  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1697  entry_count,
1698  invalid_slot_val,
1699  key_component_count,
1700  with_val_slot,
1701  key_handler,
1702  num_elems,
1703  cpu_thread_idx,
1704  cpu_thread_count);
1705 }

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 188 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

195  {
196  auto slot_selector = [&](auto elem) {
197  return SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
198  };
199  return fill_hash_join_buff_impl(buff,
200  invalid_slot_val,
201  join_column,
202  type_info,
203  sd_inner_proxy,
204  sd_outer_proxy,
205  cpu_thread_idx,
206  cpu_thread_count,
207  slot_selector);
208 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 164 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and JoinHashTable::initOneToOneHashTableOnCpu().

172  {
173  auto slot_selector = [&](auto elem) {
175  buff, elem, type_info.min_val, bucket_normalization);
176  };
177  return fill_hash_join_buff_impl(buff,
178  invalid_slot_val,
179  join_column,
180  type_info,
181  sd_inner_proxy,
182  sd_outer_proxy,
183  cpu_thread_idx,
184  cpu_thread_count,
185  slot_selector);
186 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE auto fill_hash_join_buff_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_sel 
)

Definition at line 116 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_cas, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff(), and fill_hash_join_buff_bucketized().

124  {
125 #ifdef __CUDACC__
126  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
127  int32_t step = blockDim.x * gridDim.x;
128 #else
129  int32_t start = cpu_thread_idx;
130  int32_t step = cpu_thread_count;
131 #endif
132  JoinColumnTyped col{&join_column, &type_info};
133  for (auto item : col.slice(start, step)) {
134  const size_t index = item.index;
135  int64_t elem = item.element;
136  if (elem == type_info.null_val) {
137  if (type_info.uses_bw_eq) {
138  elem = type_info.translated_null_val;
139  } else {
140  continue;
141  }
142  }
143 #ifndef __CUDACC__
144  if (sd_inner_proxy &&
145  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
146  const auto outer_id = translate_str_id_to_outer_dict(
147  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
148  if (outer_id == StringDictionary::INVALID_STR_ID) {
149  continue;
150  }
151  elem = outer_id;
152  }
153  CHECK_GE(elem, type_info.min_val)
154  << "Element " << elem << " less than min val " << type_info.min_val;
155 #endif
156  int32_t* entry_ptr = slot_sel(elem);
157  if (mapd_cas(entry_ptr, invalid_slot_val, index) != invalid_slot_val) {
158  return -1;
159  }
160  }
161  return 0;
162 };
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define mapd_cas(address, compare, val)
const int64_t max_val
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 298 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

306  {
307  auto slot_selector = [&](auto elem, auto shard) {
308  return SUFFIX(get_hash_slot_sharded_opt)(buff,
309  elem,
310  type_info.min_val,
311  shard_info.entry_count_per_shard,
312  shard,
313  shard_info.num_shards,
314  shard_info.device_count);
315  };
317  invalid_slot_val,
318  join_column,
319  type_info,
320  shard_info,
321  sd_inner_proxy,
322  sd_outer_proxy,
323  cpu_thread_idx,
324  cpu_thread_count,
325  slot_selector);
326 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:89
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 264 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

274  {
275  auto slot_selector = [&](auto elem, auto shard) -> auto {
277  elem,
278  type_info.min_val,
279  shard_info.entry_count_per_shard,
280  shard,
281  shard_info.num_shards,
282  shard_info.device_count,
283  bucket_normalization);
284  };
285 
287  invalid_slot_val,
288  join_column,
289  type_info,
290  shard_info,
291  sd_inner_proxy,
292  sd_outer_proxy,
293  cpu_thread_idx,
294  cpu_thread_count,
295  slot_selector);
296 }
const size_t num_shards
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:74
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_sel 
)

Definition at line 211 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_cas, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

220  {
221 #ifdef __CUDACC__
222  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
223  int32_t step = blockDim.x * gridDim.x;
224 #else
225  int32_t start = cpu_thread_idx;
226  int32_t step = cpu_thread_count;
227 #endif
228  JoinColumnTyped col{&join_column, &type_info};
229  for (auto item : col.slice(start, step)) {
230  const size_t index = item.index;
231  int64_t elem = item.element;
232  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
233  if (shard != shard_info.shard) {
234  continue;
235  }
236  if (elem == type_info.null_val) {
237  if (type_info.uses_bw_eq) {
238  elem = type_info.translated_null_val;
239  } else {
240  continue;
241  }
242  }
243 #ifndef __CUDACC__
244  if (sd_inner_proxy &&
245  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
246  const auto outer_id = translate_str_id_to_outer_dict(
247  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
248  if (outer_id == StringDictionary::INVALID_STR_ID) {
249  continue;
250  }
251  elem = outer_id;
252  }
253  CHECK_GE(elem, type_info.min_val)
254  << "Element " << elem << " less than min val " << type_info.min_val;
255 #endif
256  int32_t* entry_ptr = slot_sel(elem, shard);
257  if (mapd_cas(entry_ptr, invalid_slot_val, index) != invalid_slot_val) {
258  return -1;
259  }
260  }
261  return 0;
262 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define mapd_cas(address, compare, val)
const size_t shard
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const size_t  cpu_thread_count 
)

Definition at line 1728 of file HashJoinRuntime.cpp.

References CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), inclusive_scan(), and SUFFIX.

1739  {
1740  int32_t* pos_buff = buff;
1741  int32_t* count_buff = buff + hash_entry_count;
1742  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1743  std::vector<std::future<void>> counter_threads;
1744  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1745  if (join_buckets_per_key.size() > 0) {
1746  counter_threads.push_back(
1747  std::async(std::launch::async,
1748  [count_buff,
1749  composite_key_dict,
1750  &hash_entry_count,
1751  &join_buckets_per_key,
1752  &join_column_per_key,
1753  cpu_thread_idx,
1754  cpu_thread_count] {
1755  const auto key_handler = OverlapsKeyHandler(
1756  join_buckets_per_key[0].bucket_sizes_for_dimension.size(),
1757  &join_column_per_key[0],
1758  join_buckets_per_key[0].bucket_sizes_for_dimension.data());
1759  count_matches_baseline(count_buff,
1760  composite_key_dict,
1761  hash_entry_count,
1762  &key_handler,
1763  join_column_per_key[0].num_elems,
1764  cpu_thread_idx,
1765  cpu_thread_count);
1766  }));
1767  } else {
1768  counter_threads.push_back(std::async(
1769  std::launch::async,
1770  [count_buff,
1771  composite_key_dict,
1772  &key_component_count,
1773  &hash_entry_count,
1774  &join_column_per_key,
1775  &type_info_per_key,
1776  &sd_inner_proxy_per_key,
1777  &sd_outer_proxy_per_key,
1778  cpu_thread_idx,
1779  cpu_thread_count] {
1780  const auto key_handler = GenericKeyHandler(key_component_count,
1781  true,
1782  &join_column_per_key[0],
1783  &type_info_per_key[0],
1784  &sd_inner_proxy_per_key[0],
1785  &sd_outer_proxy_per_key[0]);
1786  count_matches_baseline(count_buff,
1787  composite_key_dict,
1788  hash_entry_count,
1789  &key_handler,
1790  join_column_per_key[0].num_elems,
1791  cpu_thread_idx,
1792  cpu_thread_count);
1793  }));
1794  }
1795  }
1796 
1797  for (auto& child : counter_threads) {
1798  child.get();
1799  }
1800 
1801  std::vector<int32_t> count_copy(hash_entry_count, 0);
1802  CHECK_GT(hash_entry_count, int64_t(0));
1803  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1805  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1806  std::vector<std::future<void>> pos_threads;
1807  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1808  pos_threads.push_back(std::async(
1809  std::launch::async,
1810  [&](const int thread_idx) {
1811  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1812  if (count_buff[i]) {
1813  pos_buff[i] = count_copy[i];
1814  }
1815  }
1816  },
1817  cpu_thread_idx));
1818  }
1819  for (auto& child : pos_threads) {
1820  child.get();
1821  }
1822 
1823  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1824  std::vector<std::future<void>> rowid_threads;
1825  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1826  if (join_buckets_per_key.size() > 0) {
1827  rowid_threads.push_back(
1828  std::async(std::launch::async,
1829  [buff,
1830  composite_key_dict,
1831  hash_entry_count,
1832  invalid_slot_val,
1833  &join_column_per_key,
1834  &join_buckets_per_key,
1835  cpu_thread_idx,
1836  cpu_thread_count] {
1837  const auto key_handler = OverlapsKeyHandler(
1838  join_buckets_per_key[0].bucket_sizes_for_dimension.size(),
1839  &join_column_per_key[0],
1840  join_buckets_per_key[0].bucket_sizes_for_dimension.data());
1842  (buff,
1843  composite_key_dict,
1844  hash_entry_count,
1845  invalid_slot_val,
1846  &key_handler,
1847  join_column_per_key[0].num_elems,
1848  cpu_thread_idx,
1849  cpu_thread_count);
1850  }));
1851  } else {
1852  rowid_threads.push_back(std::async(std::launch::async,
1853  [buff,
1854  composite_key_dict,
1855  hash_entry_count,
1856  invalid_slot_val,
1857  key_component_count,
1858  &join_column_per_key,
1859  &type_info_per_key,
1860  &sd_inner_proxy_per_key,
1861  &sd_outer_proxy_per_key,
1862  cpu_thread_idx,
1863  cpu_thread_count] {
1864  const auto key_handler = GenericKeyHandler(
1865  key_component_count,
1866  true,
1867  &join_column_per_key[0],
1868  &type_info_per_key[0],
1869  &sd_inner_proxy_per_key[0],
1870  &sd_outer_proxy_per_key[0]);
1872  (buff,
1873  composite_key_dict,
1874  hash_entry_count,
1875  invalid_slot_val,
1876  &key_handler,
1877  join_column_per_key[0].num_elems,
1878  cpu_thread_idx,
1879  cpu_thread_count);
1880  }));
1881  }
1882  }
1883 
1884  for (auto& child : rowid_threads) {
1885  child.get();
1886  }
1887 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count 
)

Definition at line 1889 of file HashJoinRuntime.cpp.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu(), and BaselineJoinHashTable::initHashTableOnCpu().

1900  {
1901  fill_one_to_many_baseline_hash_table<int32_t>(buff,
1902  composite_key_dict,
1903  hash_entry_count,
1904  invalid_slot_val,
1905  key_component_count,
1906  join_column_per_key,
1907  type_info_per_key,
1908  join_bucket_info,
1909  sd_inner_proxy_per_key,
1910  sd_outer_proxy_per_key,
1911  cpu_thread_count);
1912 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count 
)

Definition at line 1914 of file HashJoinRuntime.cpp.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu(), and BaselineJoinHashTable::initHashTableOnCpu().

1925  {
1926  fill_one_to_many_baseline_hash_table<int64_t>(buff,
1927  composite_key_dict,
1928  hash_entry_count,
1929  invalid_slot_val,
1930  key_component_count,
1931  join_column_per_key,
1932  type_info_per_key,
1933  join_bucket_info,
1934  sd_inner_proxy_per_key,
1935  sd_outer_proxy_per_key,
1936  cpu_thread_count);
1937 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1356 of file HashJoinRuntime.cpp.

References count_matches(), fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by JoinHashTable::initOneToManyHashTableOnCpu().

1363  {
1364  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1365  invalid_slot_val,
1366  &join_column,
1367  &type_info,
1368  sd_inner_proxy,
1369  sd_outer_proxy](auto cpu_thread_idx,
1370  auto cpu_thread_count) {
1372  (count_buff,
1373  invalid_slot_val,
1374  join_column,
1375  type_info,
1376  sd_inner_proxy,
1377  sd_outer_proxy,
1378  cpu_thread_idx,
1379  cpu_thread_count);
1380  };
1381  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1382  buff,
1383  invalid_slot_val,
1384  &join_column,
1385  &type_info,
1386  sd_inner_proxy,
1387  sd_outer_proxy](auto cpu_thread_idx,
1388  auto cpu_thread_count) {
1390  (buff,
1391  hash_entry_count,
1392  invalid_slot_val,
1393  join_column,
1394  type_info,
1395  sd_inner_proxy,
1396  sd_outer_proxy,
1397  cpu_thread_idx,
1398  cpu_thread_count);
1399  };
1400 
1402  hash_entry_info.hash_entry_count,
1403  invalid_slot_val,
1404  join_column,
1405  type_info,
1406  sd_inner_proxy,
1407  sd_outer_proxy,
1408  cpu_thread_count,
1409  launch_count_matches,
1410  launch_fill_row_ids);
1411 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t hash_entry_count
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1413 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by JoinHashTable::initOneToManyHashTableOnCpu().

1420  {
1421  auto bucket_normalization = hash_entry_info.bucket_normalization;
1422  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1423  auto launch_count_matches = [bucket_normalization,
1424  count_buff = buff + hash_entry_count,
1425  invalid_slot_val,
1426  &join_column,
1427  &type_info,
1428  sd_inner_proxy,
1429  sd_outer_proxy](auto cpu_thread_idx,
1430  auto cpu_thread_count) {
1432  (count_buff,
1433  invalid_slot_val,
1434  join_column,
1435  type_info,
1436  sd_inner_proxy,
1437  sd_outer_proxy,
1438  cpu_thread_idx,
1439  cpu_thread_count,
1440  bucket_normalization);
1441  };
1442  auto launch_fill_row_ids = [bucket_normalization,
1443  hash_entry_count,
1444  buff,
1445  invalid_slot_val,
1446  &join_column,
1447  &type_info,
1448  sd_inner_proxy,
1449  sd_outer_proxy](auto cpu_thread_idx,
1450  auto cpu_thread_count) {
1452  (buff,
1453  hash_entry_count,
1454  invalid_slot_val,
1455  join_column,
1456  type_info,
1457  sd_inner_proxy,
1458  sd_outer_proxy,
1459  cpu_thread_idx,
1460  cpu_thread_count,
1461  bucket_normalization);
1462  };
1463 
1465  hash_entry_count,
1466  invalid_slot_val,
1467  join_column,
1468  type_info,
1469  sd_inner_proxy,
1470  sd_outer_proxy,
1471  cpu_thread_count,
1472  launch_count_matches,
1473  launch_fill_row_ids);
1474 }
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1295 of file HashJoinRuntime.cpp.

References CHECK_GT, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1304  {
1305  int32_t* pos_buff = buff;
1306  int32_t* count_buff = buff + hash_entry_count;
1307  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1308  std::vector<std::future<void>> counter_threads;
1309  for (unsigned cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1310  counter_threads.push_back(std::async(
1311  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1312  }
1313 
1314  for (auto& child : counter_threads) {
1315  child.get();
1316  }
1317 
1318  std::vector<int32_t> count_copy(hash_entry_count, 0);
1319  CHECK_GT(hash_entry_count, int64_t(0));
1320  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1321 #if HAVE_CUDA
1322  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1323 #else
1325  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1326 #endif
1327  std::vector<std::future<void>> pos_threads;
1328  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1329  pos_threads.push_back(std::async(
1330  std::launch::async,
1331  [&](size_t thread_idx) {
1332  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1333  if (count_buff[i]) {
1334  pos_buff[i] = count_copy[i];
1335  }
1336  }
1337  },
1338  cpu_thread_idx));
1339  }
1340  for (auto& child : pos_threads) {
1341  child.get();
1342  }
1343 
1344  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1345  std::vector<std::future<void>> rowid_threads;
1346  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1347  rowid_threads.push_back(std::async(
1348  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1349  }
1350 
1351  for (auto& child : rowid_threads) {
1352  child.get();
1353  }
1354 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1536 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1544  {
1545  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1546  invalid_slot_val,
1547  &join_column,
1548  &type_info,
1549  &shard_info
1550 #ifndef __CUDACC__
1551  ,
1552  sd_inner_proxy,
1553  sd_outer_proxy
1554 #endif
1555  ](auto cpu_thread_idx, auto cpu_thread_count) {
1556  return SUFFIX(count_matches_sharded)(count_buff,
1557  invalid_slot_val,
1558  join_column,
1559  type_info,
1560  shard_info
1561 #ifndef __CUDACC__
1562  ,
1563  sd_inner_proxy,
1564  sd_outer_proxy,
1565  cpu_thread_idx,
1566  cpu_thread_count
1567 #endif
1568  );
1569  };
1570 
1571  auto launch_fill_row_ids = [buff,
1572  hash_entry_count,
1573  invalid_slot_val,
1574  &join_column,
1575  &type_info,
1576  &shard_info
1577 #ifndef __CUDACC__
1578  ,
1579  sd_inner_proxy,
1580  sd_outer_proxy
1581 #endif
1582  ](auto cpu_thread_idx, auto cpu_thread_count) {
1583  return SUFFIX(fill_row_ids_sharded)(buff,
1584  hash_entry_count,
1585  invalid_slot_val,
1586  join_column,
1587  type_info,
1588  shard_info
1589 #ifndef __CUDACC__
1590  ,
1591  sd_inner_proxy,
1592  sd_outer_proxy,
1593  cpu_thread_idx,
1594  cpu_thread_count);
1595 #endif
1596  };
1597 
1599  hash_entry_count,
1600  invalid_slot_val,
1601  join_column,
1602  type_info,
1603  shard_info
1604 #ifndef __CUDACC__
1605  ,
1606  sd_inner_proxy,
1607  sd_outer_proxy,
1608  cpu_thread_count
1609 #endif
1610  ,
1611  launch_count_matches,
1612  launch_fill_row_ids);
1613 }
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1477 of file HashJoinRuntime.cpp.

References CHECK_GT, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1488  {
1489  int32_t* pos_buff = buff;
1490  int32_t* count_buff = buff + hash_entry_count;
1491  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1492  std::vector<std::future<void>> counter_threads;
1493  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1494  counter_threads.push_back(std::async(
1495  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1496  }
1497 
1498  for (auto& child : counter_threads) {
1499  child.get();
1500  }
1501 
1502  std::vector<int32_t> count_copy(hash_entry_count, 0);
1503  CHECK_GT(hash_entry_count, int64_t(0));
1504  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1506  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1507  std::vector<std::future<void>> pos_threads;
1508  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1509  pos_threads.push_back(std::async(
1510  std::launch::async,
1511  [&](const unsigned thread_idx) {
1512  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1513  if (count_buff[i]) {
1514  pos_buff[i] = count_copy[i];
1515  }
1516  }
1517  },
1518  cpu_thread_idx));
1519  }
1520  for (auto& child : pos_threads) {
1521  child.get();
1522  }
1523 
1524  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1525  std::vector<std::future<void>> rowid_threads;
1526  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1527  rowid_threads.push_back(std::async(
1528  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1529  }
1530 
1531  for (auto& child : rowid_threads) {
1532  child.get();
1533  }
1534 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 838 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

850  {
851  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
852  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
853  };
854 
855  fill_row_ids_impl(buff,
856  hash_entry_count,
857  invalid_slot_val,
858  join_column,
859  type_info
860 #ifndef __CUDACC__
861  ,
862  sd_inner_proxy,
863  sd_outer_proxy,
864  cpu_thread_idx,
865  cpu_thread_count
866 #endif
867  ,
868  slot_sel);
869 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1049 of file HashJoinRuntime.cpp.

References assert(), CHECK_NE, g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, and SUFFIX.

Referenced by fill_one_to_many_baseline_hash_table().

1060  {
1061  int32_t* pos_buff = buff;
1062  int32_t* count_buff = buff + hash_entry_count;
1063  int32_t* id_buff = count_buff + hash_entry_count;
1064 #ifdef __CUDACC__
1065  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1066  int32_t step = blockDim.x * gridDim.x;
1067 #else
1068  int32_t start = cpu_thread_idx;
1069  int32_t step = cpu_thread_count;
1070 #endif
1071 
1072  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1073 #ifdef __CUDACC__
1074  assert(composite_key_dict);
1075 #endif
1076  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1077  auto key_buff_handler = [composite_key_dict,
1078  hash_entry_count,
1079  pos_buff,
1080  invalid_slot_val,
1081  count_buff,
1082  id_buff,
1083  key_size_in_bytes](const int64_t row_index,
1084  const T* key_scratch_buff,
1085  const size_t key_component_count) {
1086  const T* matching_group =
1088  key_component_count,
1089  composite_key_dict,
1090  hash_entry_count,
1091  key_size_in_bytes);
1092  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1093  int32_t* pos_ptr = pos_buff + entry_idx;
1094 #ifndef __CUDACC__
1095  CHECK_NE(*pos_ptr, invalid_slot_val);
1096 #endif
1097  const auto bin_idx = pos_ptr - pos_buff;
1098  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1099  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1100  return 0;
1101  };
1102 
1103  JoinColumnTuple cols(
1104  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1105  for (auto& it : cols.slice(start, step)) {
1106  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1107  }
1108  return;
1109 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define CHECK_NE(x, y)
Definition: Logger.h:206
int64_t const int32_t sz assert(dest)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 871 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

884  {
885  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
887  pos_buff, elem, type_info.min_val, bucket_normalization);
888  };
889  fill_row_ids_impl(buff,
890  hash_entry_count,
891  invalid_slot_val,
892  join_column,
893  type_info
894 #ifndef __CUDACC__
895  ,
896  sd_inner_proxy,
897  sd_outer_proxy,
898  cpu_thread_idx,
899  cpu_thread_count
900 #endif
901  ,
902  slot_sel);
903 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 779 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

792  {
793  int32_t* pos_buff = buff;
794  int32_t* count_buff = buff + hash_entry_count;
795  int32_t* id_buff = count_buff + hash_entry_count;
796 
797 #ifdef __CUDACC__
798  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
799  int32_t step = blockDim.x * gridDim.x;
800 #else
801  int32_t start = cpu_thread_idx;
802  int32_t step = cpu_thread_count;
803 #endif
804  JoinColumnTyped col{&join_column, &type_info};
805  for (auto item : col.slice(start, step)) {
806  const size_t index = item.index;
807  int64_t elem = item.element;
808  if (elem == type_info.null_val) {
809  if (type_info.uses_bw_eq) {
810  elem = type_info.translated_null_val;
811  } else {
812  continue;
813  }
814  }
815 #ifndef __CUDACC__
816  if (sd_inner_proxy &&
817  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
818  const auto outer_id = translate_str_id_to_outer_dict(
819  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
820  if (outer_id == StringDictionary::INVALID_STR_ID) {
821  continue;
822  }
823  elem = outer_id;
824  }
825  CHECK_GE(elem, type_info.min_val)
826  << "Element " << elem << " less than min val " << type_info.min_val;
827 #endif
828  auto pos_ptr = slot_selector(pos_buff, elem);
829 #ifndef __CUDACC__
830  CHECK_NE(*pos_ptr, invalid_slot_val);
831 #endif
832  const auto bin_idx = pos_ptr - pos_buff;
833  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
834  id_buff[id_buff_idx] = static_cast<int32_t>(index);
835  }
836 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:206
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 967 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

980  {
981  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
982  return SUFFIX(get_hash_slot_sharded)(pos_buff,
983  elem,
984  type_info.min_val,
985  shard_info.entry_count_per_shard,
986  shard_info.num_shards,
987  shard_info.device_count);
988  };
989 
990  fill_row_ids_impl(buff,
991  hash_entry_count,
992  invalid_slot_val,
993  join_column,
994  type_info
995 #ifndef __CUDACC__
996  ,
997  sd_inner_proxy,
998  sd_outer_proxy,
999  cpu_thread_idx,
1000  cpu_thread_count
1001 #endif
1002  ,
1003  slot_sel);
1004 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:60
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1006 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1020  {
1021  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1022  auto elem) {
1023  return SUFFIX(get_bucketized_hash_slot_sharded)(pos_buff,
1024  elem,
1025  type_info.min_val,
1026  shard_info.entry_count_per_shard,
1027  shard_info.num_shards,
1028  shard_info.device_count,
1029  bucket_normalization);
1030  };
1031 
1032  fill_row_ids_impl(buff,
1033  hash_entry_count,
1034  invalid_slot_val,
1035  join_column,
1036  type_info
1037 #ifndef __CUDACC__
1038  ,
1039  sd_inner_proxy,
1040  sd_outer_proxy,
1041  cpu_thread_idx,
1042  cpu_thread_count
1043 #endif
1044  ,
1045  slot_sel);
1046 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:45
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 906 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

920  {
921 
922  int32_t* pos_buff = buff;
923  int32_t* count_buff = buff + hash_entry_count;
924  int32_t* id_buff = count_buff + hash_entry_count;
925 
926 #ifdef __CUDACC__
927  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
928  int32_t step = blockDim.x * gridDim.x;
929 #else
930  int32_t start = cpu_thread_idx;
931  int32_t step = cpu_thread_count;
932 #endif
933  JoinColumnTyped col{&join_column, &type_info};
934  for (auto item : col.slice(start, step)) {
935  const size_t index = item.index;
936  int64_t elem = item.element;
937  if (elem == type_info.null_val) {
938  if (type_info.uses_bw_eq) {
939  elem = type_info.translated_null_val;
940  } else {
941  continue;
942  }
943  }
944 #ifndef __CUDACC__
945  if (sd_inner_proxy &&
946  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
947  const auto outer_id = translate_str_id_to_outer_dict(
948  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
949  if (outer_id == StringDictionary::INVALID_STR_ID) {
950  continue;
951  }
952  elem = outer_id;
953  }
954  CHECK_GE(elem, type_info.min_val)
955  << "Element " << elem << " less than min val " << type_info.min_val;
956 #endif
957  auto* pos_ptr = slot_selector(pos_buff, elem);
958 #ifndef __CUDACC__
959  CHECK_NE(*pos_ptr, invalid_slot_val);
960 #endif
961  const auto bin_idx = pos_ptr - pos_buff;
962  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
963  id_buff[id_buff_idx] = static_cast<int32_t>(index);
964  }
965 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:206
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 403 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), load_cst, store_cst, SUFFIX, and UNLIKELY.

Referenced by write_baseline_hash_slot().

407  {
408  uint32_t off = h * hash_entry_size;
409  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
410  T empty_key = SUFFIX(get_invalid_key)<T>();
411  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
412  if (UNLIKELY(*key == write_pending)) {
413  // Address the singularity case where the first column contains the pending
414  // write special value. Should never happen, but avoid doing wrong things.
415  return nullptr;
416  }
417  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
418  if (success) {
419  if (key_component_count > 1) {
420  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
421  }
422  store_cst(row_ptr, *key);
423  return reinterpret_cast<T*>(row_ptr + key_component_count);
424  }
425  while (load_cst(row_ptr) == write_pending) {
426  // spin until the winning thread has finished writing the entire key
427  }
428  for (size_t i = 0; i < key_component_count; ++i) {
429  if (load_cst(row_ptr + i) != key[i]) {
430  return nullptr;
431  }
432  }
433  return reinterpret_cast<T*>(row_ptr + key_component_count);
434 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE NEVER_INLINE const T* SUFFIX() get_matching_baseline_hash_slot_readonly ( const T *  key,
const size_t  key_component_count,
const T *  composite_key_dict,
const int64_t  entry_count,
const size_t  key_size_in_bytes 
)

Definition at line 703 of file HashJoinRuntime.cpp.

References assert(), CHECK, keys_are_equal(), and MurmurHash1Impl().

Referenced by count_matches_baseline(), and fill_row_ids_baseline().

708  {
709  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
710  uint32_t off = h * key_component_count;
711  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
712  return &composite_key_dict[off];
713  }
714  uint32_t h_probe = (h + 1) % entry_count;
715  while (h_probe != h) {
716  off = h_probe * key_component_count;
717  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
718  return &composite_key_dict[off];
719  }
720  h_probe = (h_probe + 1) % entry_count;
721  }
722 #ifndef __CUDACC__
723  CHECK(false);
724 #else
725  assert(false);
726 #endif
727  return nullptr;
728 }
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
int64_t const int32_t sz assert(dest)
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename InputIterator , typename OutputIterator >
void inclusive_scan ( InputIterator  first,
InputIterator  last,
OutputIterator  out,
const size_t  thread_count 
)

Definition at line 1223 of file HashJoinRuntime.cpp.

References out.

Referenced by approximate_distinct_tuples_on_device_overlaps(), approximate_distinct_tuples_overlaps(), fill_one_to_many_baseline_hash_table(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_impl(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded_impl().

1226  {
1227  using ElementType = typename InputIterator::value_type;
1228  using OffsetType = typename InputIterator::difference_type;
1229  const OffsetType elem_count = last - first;
1230  if (elem_count < 10000 || thread_count <= 1) {
1231  ElementType sum = 0;
1232  for (auto iter = first; iter != last; ++iter, ++out) {
1233  *out = sum += *iter;
1234  }
1235  return;
1236  }
1237 
1238  const OffsetType step = (elem_count + thread_count - 1) / thread_count;
1239  OffsetType start_off = 0;
1240  OffsetType end_off = std::min(step, elem_count);
1241  std::vector<ElementType> partial_sums(thread_count);
1242  std::vector<std::future<void>> counter_threads;
1243  for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,
1244  start_off = std::min(start_off + step, elem_count),
1245  end_off = std::min(start_off + step, elem_count)) {
1246  counter_threads.push_back(std::async(
1247  std::launch::async,
1248  [first, out](
1249  ElementType& partial_sum, const OffsetType start, const OffsetType end) {
1250  ElementType sum = 0;
1251  for (auto in_iter = first + start, out_iter = out + start;
1252  in_iter != (first + end);
1253  ++in_iter, ++out_iter) {
1254  *out_iter = sum += *in_iter;
1255  }
1256  partial_sum = sum;
1257  },
1258  std::ref(partial_sums[thread_idx]),
1259  start_off,
1260  end_off));
1261  }
1262  for (auto& child : counter_threads) {
1263  child.get();
1264  }
1265 
1266  ElementType sum = 0;
1267  for (auto& s : partial_sums) {
1268  s += sum;
1269  sum = s;
1270  }
1271 
1272  counter_threads.clear();
1273  start_off = std::min(step, elem_count);
1274  end_off = std::min(start_off + step, elem_count);
1275  for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,
1276  start_off = std::min(start_off + step, elem_count),
1277  end_off = std::min(start_off + step, elem_count)) {
1278  counter_threads.push_back(std::async(
1279  std::launch::async,
1280  [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {
1281  for (auto iter = out + start; iter != (out + end); ++iter) {
1282  *iter += prev_sum;
1283  }
1284  },
1285  partial_sums[thread_idx],
1286  start_off,
1287  end_off));
1288  }
1289  for (auto& child : counter_threads) {
1290  child.get();
1291  }
1292 }
const int8_t const int64_t const uint64_t const int32_t const int64_t int64_t ** out

+ Here is the caller graph for this function:

template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 329 of file HashJoinRuntime.cpp.

References get_invalid_key(), and SUFFIX.

Referenced by init_baseline_hash_join_buff_wrapper().

335  {
336 #ifdef __CUDACC__
337  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
338  int32_t step = blockDim.x * gridDim.x;
339 #else
340  int32_t start = cpu_thread_idx;
341  int32_t step = cpu_thread_count;
342 #endif
343  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
344  const T empty_key = SUFFIX(get_invalid_key)<T>();
345  for (int64_t h = start; h < entry_count; h += step) {
346  int64_t off = h * hash_entry_size;
347  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
348  for (size_t i = 0; i < key_component_count; ++i) {
349  row_ptr[i] = empty_key;
350  }
351  if (with_val_slot) {
352  row_ptr[key_component_count] = invalid_slot_val;
353  }
354  }
355 }
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1615 of file HashJoinRuntime.cpp.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu(), and BaselineJoinHashTable::initHashTableOnCpu().

1621  {
1622  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1623  entry_count,
1624  key_component_count,
1625  with_val_slot,
1626  invalid_slot_val,
1627  cpu_thread_idx,
1628  cpu_thread_count);
1629 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1631 of file HashJoinRuntime.cpp.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu(), and BaselineJoinHashTable::initHashTableOnCpu().

1637  {
1638  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1639  entry_count,
1640  key_component_count,
1641  with_val_slot,
1642  invalid_slot_val,
1643  cpu_thread_idx,
1644  cpu_thread_count);
1645 }

+ Here is the caller graph for this function:

DEVICE void SUFFIX() init_hash_join_buff ( int32_t *  groups_buffer,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 92 of file HashJoinRuntime.cpp.

References groups_buffer.

Referenced by init_hash_join_buff_wrapper(), OverlapsJoinHashTable::initHashTableOnCpu(), BaselineJoinHashTable::initHashTableOnCpu(), JoinHashTable::initOneToManyHashTableOnCpu(), and JoinHashTable::initOneToOneHashTableOnCpu().

96  {
97 #ifdef __CUDACC__
98  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
99  int32_t step = blockDim.x * gridDim.x;
100 #else
101  int32_t start = cpu_thread_idx;
102  int32_t step = cpu_thread_count;
103 #endif
104  for (int64_t i = start; i < hash_entry_count; i += step) {
105  groups_buffer[i] = invalid_slot_val;
106  }
107 }
const int32_t groups_buffer_size return groups_buffer

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1667 of file HashJoinRuntime.cpp.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu().

1675  {
1676  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1677  entry_count,
1678  invalid_slot_val,
1679  key_component_count,
1680  with_val_slot,
1681  key_handler,
1682  num_elems,
1683  cpu_thread_idx,
1684  cpu_thread_count);
1685 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1707 of file HashJoinRuntime.cpp.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu().

1715  {
1716  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1717  entry_count,
1718  invalid_slot_val,
1719  key_component_count,
1720  with_val_slot,
1721  key_handler,
1722  num_elems,
1723  cpu_thread_idx,
1724  cpu_thread_count);
1725 }

+ Here is the caller graph for this function:

template<typename T >
DEVICE int write_baseline_hash_slot ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 443 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, and MurmurHash1Impl().

451  {
452  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
453  T* matching_group = get_matching_baseline_hash_slot_at(
454  hash_buff, h, key, key_component_count, hash_entry_size);
455  if (!matching_group) {
456  uint32_t h_probe = (h + 1) % entry_count;
457  while (h_probe != h) {
458  matching_group = get_matching_baseline_hash_slot_at(
459  hash_buff, h_probe, key, key_component_count, hash_entry_size);
460  if (matching_group) {
461  break;
462  }
463  h_probe = (h_probe + 1) % entry_count;
464  }
465  }
466  if (!matching_group) {
467  return -2;
468  }
469  if (!with_val_slot) {
470  return 0;
471  }
472  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
473  return -1;
474  }
475  return 0;
476 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function: