OmniSciDB  a667adc9c8
HashJoinRuntime.cpp File Reference

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict (const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE auto fill_hash_join_buff_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const size_t cpu_thread_count)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 

Macro Definition Documentation

#define cas_cst(ptr, expected, desired)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 412 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 416 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add(address, val)   __sync_fetch_and_add(address, val)

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)

#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 415 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().
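
Together these macros give the hash-table builders one spelling for atomic operations that compiles to GCC/Clang builtins on the CPU path (the CUDA build substitutes device intrinsics). As a rough illustration of the pattern they support, the sketch below uses the same builtins to claim an empty slot or accept that another thread already stored the same key; the EMPTY_KEY constant and slot layout are illustrative assumptions, not the runtime's.

#include <cstdint>

// Illustrative only: EMPTY_KEY and the single-int64_t slot are assumptions,
// not the runtime's actual constants or layout.
constexpr int64_t EMPTY_KEY = -1;

// Claim a slot for `key`, or accept it if another thread already wrote the
// same key. cas_cst(ptr, expected, desired) expands to the
// __atomic_compare_exchange_n call below; load_cst(ptr) to __atomic_load_n.
bool claim_or_match(int64_t* slot, int64_t key) {
  int64_t expected = EMPTY_KEY;
  if (__atomic_compare_exchange_n(
          slot, &expected, key, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
    return true;  // this thread wrote the key
  }
  return __atomic_load_n(slot, __ATOMIC_SEQ_CST) == key;  // someone else wrote the same key
}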

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 1964 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

1969  {
1970  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
1971  CHECK(!join_column_per_key.empty());
1972 
1973  std::vector<std::future<void>> approx_distinct_threads;
1974  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
1975  approx_distinct_threads.push_back(std::async(
1976  std::launch::async,
1977  [&join_column_per_key,
1978  &type_info_per_key,
1979  b,
1980  hll_buffer_all_cpus,
1981  padded_size_bytes,
1982  thread_idx,
1983  thread_count] {
1984  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
1985 
1986  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
1987  false,
1988  &join_column_per_key[0],
1989  &type_info_per_key[0],
1990  nullptr,
1991  nullptr);
1992  approximate_distinct_tuples_impl(hll_buffer,
1993  nullptr,
1994  b,
1995  join_column_per_key[0].num_elems,
1996  &key_handler,
1997  thread_idx,
1998  thread_count);
1999  }));
2000  }
2001  for (auto& child : approx_distinct_threads) {
2002  child.get();
2003  }
2004 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:197


template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1135 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), and approximate_distinct_tuples_overlaps().

1145  {
1146 #ifdef __CUDACC__
1147  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1148  int32_t step = blockDim.x * gridDim.x;
1149 #else
1150  int32_t start = cpu_thread_idx;
1151  int32_t step = cpu_thread_count;
1152 #endif
1153 
1154  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1155  const int64_t entry_idx,
1156  const int64_t* key_scratch_buff,
1157  const size_t key_component_count) {
1158  if (row_count_buffer) {
1159  row_count_buffer[entry_idx] += 1;
1160  }
1161 
1162  const uint64_t hash =
1163  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1164  const uint32_t index = hash >> (64 - b);
1165  const auto rank = get_rank(hash << b, 64 - b);
1166 #ifdef __CUDACC__
1167  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1168 #else
1169  hll_buffer[index] = std::max(hll_buffer[index], rank);
1170 #endif
1171 
1172  return 0;
1173  };
1174 
1175  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1176 
1177  JoinColumnTuple cols(
1178  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1179  for (auto& it : cols.slice(start, step)) {
1180  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1181  }
1182 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
char * f
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

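The loop above is the HyperLogLog register update: the top b bits of the 64-bit Murmur hash select a register, and the rank of the remaining bits is kept as a running maximum (atomically on GPU, with std::max on CPU). A self-contained sketch of the same update rule, with a simplified stand-in for get_rank, looks like this:

#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified stand-in for get_rank(): 1-based position of the highest set bit
// within the top `bits` bits of x (bits + 1 if none of them is set).
inline uint8_t rank_of(uint64_t x, uint32_t bits) {
  uint8_t rank = 1;
  while (rank <= bits && !(x & (1ULL << 63))) {
    x <<= 1;
    ++rank;
  }
  return rank;
}

// One register update, mirroring the CPU branch of approximate_distinct_tuples_impl.
void hll_update(std::vector<uint8_t>& registers, uint64_t hash, uint32_t b) {
  const uint32_t index = static_cast<uint32_t>(hash >> (64 - b));  // top b bits pick the register
  const uint8_t rank = rank_of(hash << b, 64 - b);                 // rank of the remaining bits
  registers[index] = std::max(registers[index], rank);
}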

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2006 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2014  {
2015  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2016  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2017  CHECK(!join_column_per_key.empty());
2018 
2019  std::vector<std::future<void>> approx_distinct_threads;
2020  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2021  approx_distinct_threads.push_back(std::async(
2022  std::launch::async,
2023  [&join_column_per_key,
2024  &join_buckets_per_key,
2025  &row_counts,
2026  b,
2027  hll_buffer_all_cpus,
2028  padded_size_bytes,
2029  thread_idx,
2030  thread_count] {
2031  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2032 
2033  const auto key_handler = OverlapsKeyHandler(
2034  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2035  &join_column_per_key[0],
2036  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2037  approximate_distinct_tuples_impl(hll_buffer,
2038  row_counts.data(),
2039  b,
2040  join_column_per_key[0].num_elems,
2041  &key_handler,
2042  thread_idx,
2043  thread_count);
2044  }));
2045  }
2046  for (auto& child : approx_distinct_threads) {
2047  child.get();
2048  }
2049 
2050  inclusive_scan(
2051  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2052 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:197


template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1205 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), generate_TableFunctionsFactory_init::j, JoinColumnIterator::ptr(), and SUFFIX.

1214  {
1215 #ifdef __CUDACC__
1216  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1217  int32_t step = blockDim.x * gridDim.x;
1218 #else
1219  int32_t start = cpu_thread_idx;
1220  int32_t step = cpu_thread_count;
1221 #endif
1222  JoinColumnIterator it(join_column, type_info, start, step);
1223  for (; it; ++it) {
1224  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1225  double bounds[2 * N];
1226  for (size_t j = 0; j < 2 * N; j++) {
1227  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1228  }
1229 
1230  for (size_t j = 0; j < N; j++) {
1231  const auto diff = bounds[j + N] - bounds[j];
1232 #ifdef __CUDACC__
1233  if (diff > bucket_size_thresholds[j]) {
1234  atomicMin(&bucket_sizes_for_thread[j], diff);
1235  }
1236 #else
1237  if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {
1238  bucket_sizes_for_thread[j] = diff;
1239  }
1240 #endif
1241  }
1242  }
1243 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:134

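For the overlaps join the bounds column is laid out (x_min, y_min, x_max, y_max) for N = 2, the per-dimension extent is bounds[j + N] - bounds[j], and the CPU branch keeps the largest extent per dimension that still stays under the corresponding threshold. A hedged, single-threaded sketch of that selection rule for N = 2 (the Box type and helper name are illustrative):

#include <array>
#include <vector>

struct Box { double x_min, y_min, x_max, y_max; };

// Largest extent per dimension that stays below the threshold, as in the
// CPU branch of compute_bucket_sizes_impl<2>.
std::array<double, 2> bucket_sizes(const std::vector<Box>& boxes,
                                   const std::array<double, 2>& thresholds) {
  std::array<double, 2> sizes{0.0, 0.0};
  for (const auto& box : boxes) {
    const double extents[2] = {box.x_max - box.x_min, box.y_max - box.y_min};
    for (size_t j = 0; j < 2; ++j) {
      if (extents[j] < thresholds[j] && extents[j] > sizes[j]) {
        sizes[j] = extents[j];
      }
    }
  }
  return sizes;
}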

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2054 of file HashJoinRuntime.cpp.

References i.

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2058  {
2059  std::vector<std::vector<double>> bucket_sizes_for_threads;
2060  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2061  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2062  }
2063  std::vector<std::future<void>> threads;
2064  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2065  threads.push_back(std::async(std::launch::async,
2066  compute_bucket_sizes_impl<2>,
2067  bucket_sizes_for_threads[thread_idx].data(),
2068  &join_column,
2069  &type_info,
2070  bucket_size_thresholds.data(),
2071  thread_idx,
2072  thread_count));
2073  }
2074  for (auto& child : threads) {
2075  child.get();
2076  }
2077 
2078  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2079  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2080  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2081  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2082  }
2083  }
2084  }
2085 }


GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 608 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

619  {
620  auto slot_sel = [&type_info](auto count_buff, auto elem) {
621  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
622  };
623  count_matches_impl(count_buff,
624  invalid_slot_val,
625  join_column,
626  type_info
627 #ifndef __CUDACC__
628  ,
629  sd_inner_proxy,
630  sd_outer_proxy,
631  cpu_thread_idx,
632  cpu_thread_count
633 #endif
634  ,
635  slot_sel);
636 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)


template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 752 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

762  {
763 #ifdef __CUDACC__
764  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
765  int32_t step = blockDim.x * gridDim.x;
766 #else
767  int32_t start = cpu_thread_idx;
768  int32_t step = cpu_thread_count;
769 #endif
770 #ifdef __CUDACC__
771  assert(composite_key_dict);
772 #endif
773  T key_scratch_buff[g_maximum_conditions_to_coalesce];
774  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
775  auto key_buff_handler = [composite_key_dict,
776  entry_count,
777  count_buff,
778  key_size_in_bytes](const int64_t row_entry_idx,
779  const T* key_scratch_buff,
780  const size_t key_component_count) {
781  const auto matching_group =
782  SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,
783  key_component_count,
784  composite_key_dict,
785  entry_count,
786  key_size_in_bytes);
787  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
788  mapd_add(&count_buff[entry_idx], int32_t(1));
789  return 0;
790  };
791 
792  JoinColumnTuple cols(
793  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
794  for (auto& it : cols.slice(start, step)) {
795  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
796  }
797 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
char * f
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce


GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 638 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

650  {
651  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
652  return SUFFIX(get_bucketized_hash_slot)(
653  count_buff, elem, type_info.min_val, bucket_normalization);
654  };
655  count_matches_impl(count_buff,
656  invalid_slot_val,
657  join_column,
658  type_info
659 #ifndef __CUDACC__
660  ,
661  sd_inner_proxy,
662  sd_outer_proxy,
663  cpu_thread_idx,
664  cpu_thread_count
665 #endif
666  ,
667  slot_sel);
668 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)


template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 560 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by count_matches(), and count_matches_bucketized().

572  {
573 #ifdef __CUDACC__
574  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
575  int32_t step = blockDim.x * gridDim.x;
576 #else
577  int32_t start = cpu_thread_idx;
578  int32_t step = cpu_thread_count;
579 #endif
580  JoinColumnTyped col{&join_column, &type_info};
581  for (auto item : col.slice(start, step)) {
582  int64_t elem = item.element;
583  if (elem == type_info.null_val) {
584  if (type_info.uses_bw_eq) {
585  elem = type_info.translated_null_val;
586  } else {
587  continue;
588  }
589  }
590 #ifndef __CUDACC__
591  if (sd_inner_proxy &&
592  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
593  const auto outer_id = translate_str_id_to_outer_dict(
594  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
595  if (outer_id == StringDictionary::INVALID_STR_ID) {
596  continue;
597  }
598  elem = outer_id;
599  }
600  CHECK_GE(elem, type_info.min_val)
601  << "Element " << elem << " less than min val " << type_info.min_val;
602 #endif
603  auto* entry_ptr = slot_selector(count_buff, elem);
604  mapd_add(entry_ptr, int32_t(1));
605  }
606 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

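count_matches_impl above and the fill_hash_join_buff_impl / fill_row_ids_impl variants documented elsewhere on this page share the same per-element pipeline: a sentinel null either becomes translated_null_val (when bitwise-equality null matching is on) or is skipped; on the CPU path a dictionary-encoded string id is translated from the inner to the outer dictionary and skipped when no match exists; only then is the selected slot touched. A hedged, single-threaded sketch of that pipeline, with translate_id standing in for translate_str_id_to_outer_dict:

#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

// Hedged sketch; translate_id stands in for translate_str_id_to_outer_dict and
// returns std::nullopt where the runtime returns StringDictionary::INVALID_STR_ID.
void count_matches_sketch(
    int32_t* count_buff,
    const std::vector<int64_t>& column,
    int64_t null_val,
    int64_t translated_null_val,
    bool uses_bw_eq,
    int64_t min_val,
    const std::function<std::optional<int64_t>(int64_t)>& translate_id) {
  for (int64_t elem : column) {
    if (elem == null_val) {
      if (!uses_bw_eq) {
        continue;  // plain equality: nulls never match
      }
      elem = translated_null_val;  // bitwise equality: nulls share one slot
    }
    if (translate_id && (!uses_bw_eq || elem != translated_null_val)) {
      const auto outer_id = translate_id(elem);
      if (!outer_id) {
        continue;  // no matching string in the outer dictionary
      }
      elem = *outer_id;
    }
    ++count_buff[elem - min_val];  // mapd_add makes this increment atomic
  }
}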

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 670 of file HashJoinRuntime.cpp.

References CHECK_GE, get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, mapd_add, SUFFIX, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

682  {
683 #ifdef __CUDACC__
684  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
685  int32_t step = blockDim.x * gridDim.x;
686 #else
687  int32_t start = cpu_thread_idx;
688  int32_t step = cpu_thread_count;
689 #endif
690  JoinColumnTyped col{&join_column, &type_info};
691  for (auto item : col.slice(start, step)) {
692  int64_t elem = item.element;
693  if (elem == type_info.null_val) {
694  if (type_info.uses_bw_eq) {
695  elem = type_info.translated_null_val;
696  } else {
697  continue;
698  }
699  }
700 #ifndef __CUDACC__
701  if (sd_inner_proxy &&
702  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
703  const auto outer_id = translate_str_id_to_outer_dict(
704  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
705  if (outer_id == StringDictionary::INVALID_STR_ID) {
706  continue;
707  }
708  elem = outer_id;
709  }
710  CHECK_GE(elem, type_info.min_val)
711  << "Element " << elem << " less than min val " << type_info.min_val;
712 #endif
713  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
714  elem,
715  type_info.min_val,
716  shard_info.entry_count_per_shard,
717  shard_info.num_shards,
718  shard_info.device_count);
719  mapd_add(entry_ptr, int32_t(1));
720  }
721 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:210
#define SUFFIX(name)
const int device_count
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const size_t entry_count_per_shard
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:60
#define mapd_add(address, val)


template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 496 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, and omnisci.dtypes::T.

504  {
505 #ifdef __CUDACC__
506  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
507  int32_t step = blockDim.x * gridDim.x;
508 #else
509  int32_t start = cpu_thread_idx;
510  int32_t step = cpu_thread_count;
511 #endif
512 
513  T key_scratch_buff[g_maximum_conditions_to_coalesce];
514  const size_t key_size_in_bytes = key_component_count * sizeof(T);
515  const size_t hash_entry_size =
516  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
517  auto key_buff_handler = [hash_buff,
518  entry_count,
519  with_val_slot,
520  invalid_slot_val,
521  key_size_in_bytes,
522  hash_entry_size](const int64_t entry_idx,
523  const T* key_scratch_buffer,
524  const size_t key_component_count) {
525  return write_baseline_hash_slot<T>(entry_idx,
526  hash_buff,
527  entry_count,
528  key_scratch_buffer,
529  key_component_count,
530  with_val_slot,
531  invalid_slot_val,
532  key_size_in_bytes,
533  hash_entry_size);
534  };
535 
536  JoinColumnTuple cols(
537  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
538  for (auto& it : cols.slice(start, step)) {
539  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
540  if (err) {
541  return err;
542  }
543  }
544  return 0;
545 }
char * f
const size_t g_maximum_conditions_to_coalesce
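
The baseline (composite-key) table filled above is a flat buffer of fixed-size entries: key_component_count keys of type T, optionally followed by one T-sized value slot, which is where the key_size_in_bytes and hash_entry_size expressions in the listing come from. A small sketch of that arithmetic under the same layout assumption (the helper names are illustrative):

#include <cstddef>
#include <cstdint>

// Entry layout assumed from the listing above: key_component_count keys of
// type T, optionally followed by one T-sized value slot.
template <typename T>
size_t baseline_hash_entry_size(size_t key_component_count, bool with_val_slot) {
  return (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
}

// Byte offset of entry `i` inside the flat hash buffer.
template <typename T>
size_t baseline_entry_offset(size_t i,
                             size_t key_component_count,
                             bool with_val_slot) {
  return i * baseline_hash_entry_size<T>(key_component_count, with_val_slot);
}
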
int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1672 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1680  {
1681  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1682  entry_count,
1683  invalid_slot_val,
1684  key_component_count,
1685  with_val_slot,
1686  key_handler,
1687  num_elems,
1688  cpu_thread_idx,
1689  cpu_thread_count);
1690 }


int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1712 of file HashJoinRuntime.cpp.

1720  {
1721  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1722  entry_count,
1723  invalid_slot_val,
1724  key_component_count,
1725  with_val_slot,
1726  key_handler,
1727  num_elems,
1728  cpu_thread_idx,
1729  cpu_thread_count);
1730 }
DEVICE int SUFFIX() fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 193 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

200  {
201  auto slot_selector = [&](auto elem) {
202  return SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
203  };
204  return fill_hash_join_buff_impl(buff,
205  invalid_slot_val,
206  join_column,
207  type_info,
208  sd_inner_proxy,
209  sd_outer_proxy,
210  cpu_thread_idx,
211  cpu_thread_count,
212  slot_selector);
213 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val


DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 169 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

177  {
178  auto slot_selector = [&](auto elem) {
179  return SUFFIX(get_bucketized_hash_slot)(
180  buff, elem, type_info.min_val, bucket_normalization);
181  };
182  return fill_hash_join_buff_impl(buff,
183  invalid_slot_val,
184  join_column,
185  type_info,
186  sd_inner_proxy,
187  sd_outer_proxy,
188  cpu_thread_idx,
189  cpu_thread_count,
190  slot_selector);
191 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val


template<typename SLOT_SELECTOR >
DEVICE auto fill_hash_join_buff_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_sel 
)

Definition at line 121 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_cas, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff(), and fill_hash_join_buff_bucketized().

129  {
130 #ifdef __CUDACC__
131  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
132  int32_t step = blockDim.x * gridDim.x;
133 #else
134  int32_t start = cpu_thread_idx;
135  int32_t step = cpu_thread_count;
136 #endif
137  JoinColumnTyped col{&join_column, &type_info};
138  for (auto item : col.slice(start, step)) {
139  const size_t index = item.index;
140  int64_t elem = item.element;
141  if (elem == type_info.null_val) {
142  if (type_info.uses_bw_eq) {
143  elem = type_info.translated_null_val;
144  } else {
145  continue;
146  }
147  }
148 #ifndef __CUDACC__
149  if (sd_inner_proxy &&
150  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
151  const auto outer_id = translate_str_id_to_outer_dict(
152  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
153  if (outer_id == StringDictionary::INVALID_STR_ID) {
154  continue;
155  }
156  elem = outer_id;
157  }
158  CHECK_GE(elem, type_info.min_val)
159  << "Element " << elem << " less than min val " << type_info.min_val;
160 #endif
161  int32_t* entry_ptr = slot_sel(elem);
162  if (mapd_cas(entry_ptr, invalid_slot_val, index) != invalid_slot_val) {
163  return -1;
164  }
165  }
166  return 0;
167 };
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define mapd_cas(address, compare, val)
const int64_t max_val
const int64_t min_val

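fill_hash_join_buff_impl is the one-to-one ("perfect") build: the slot selector maps a key to a single entry (essentially buff[elem - min_val], optionally bucket-normalized or sharded) and mapd_cas publishes the row index into it; a slot that is already taken means a duplicate key, and the non-zero return tells the caller to fall back to a one-to-many layout. A single-threaded sketch of that invariant, with buff pre-initialized to invalid_slot_val by the caller:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged sketch: one slot per key value starting at min_val; returns -1 on the
// first duplicate key, mirroring the runtime's contract.
int fill_one_to_one(std::vector<int32_t>& buff,
                    const std::vector<int64_t>& column,
                    int64_t min_val,
                    int32_t invalid_slot_val) {
  for (std::size_t row = 0; row < column.size(); ++row) {
    auto& slot = buff[column[row] - min_val];
    if (slot != invalid_slot_val) {
      return -1;  // key already claimed: one-to-one layout is not possible
    }
    slot = static_cast<int32_t>(row);  // mapd_cas does this test-and-set atomically
  }
  return 0;
}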

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 303 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

311  {
312  auto slot_selector = [&](auto elem, auto shard) {
313  return SUFFIX(get_hash_slot_sharded_opt)(buff,
314  elem,
315  type_info.min_val,
316  shard_info.entry_count_per_shard,
317  shard,
318  shard_info.num_shards,
319  shard_info.device_count);
320  };
321  return fill_hash_join_buff_sharded_impl(buff,
322  invalid_slot_val,
323  join_column,
324  type_info,
325  shard_info,
326  sd_inner_proxy,
327  sd_outer_proxy,
328  cpu_thread_idx,
329  cpu_thread_count,
330  slot_selector);
331 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:89
const int64_t min_val


DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 269 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

279  {
280  auto slot_selector = [&](auto elem, auto shard) -> auto {
281  return SUFFIX(get_bucketized_hash_slot_sharded_opt)(buff,
282  elem,
283  type_info.min_val,
284  shard_info.entry_count_per_shard,
285  shard,
286  shard_info.num_shards,
287  shard_info.device_count,
288  bucket_normalization);
289  };
290 
291  return fill_hash_join_buff_sharded_impl(buff,
292  invalid_slot_val,
293  join_column,
294  type_info,
295  shard_info,
296  sd_inner_proxy,
297  sd_outer_proxy,
298  cpu_thread_idx,
299  cpu_thread_count,
300  slot_selector);
301 }
const size_t num_shards
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:74
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_sel)
const int64_t min_val


template<typename SLOT_SELECTOR >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_sel 
)

Definition at line 216 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_cas, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

225  {
226 #ifdef __CUDACC__
227  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
228  int32_t step = blockDim.x * gridDim.x;
229 #else
230  int32_t start = cpu_thread_idx;
231  int32_t step = cpu_thread_count;
232 #endif
233  JoinColumnTyped col{&join_column, &type_info};
234  for (auto item : col.slice(start, step)) {
235  const size_t index = item.index;
236  int64_t elem = item.element;
237  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
238  if (shard != shard_info.shard) {
239  continue;
240  }
241  if (elem == type_info.null_val) {
242  if (type_info.uses_bw_eq) {
243  elem = type_info.translated_null_val;
244  } else {
245  continue;
246  }
247  }
248 #ifndef __CUDACC__
249  if (sd_inner_proxy &&
250  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
251  const auto outer_id = translate_str_id_to_outer_dict(
252  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
253  if (outer_id == StringDictionary::INVALID_STR_ID) {
254  continue;
255  }
256  elem = outer_id;
257  }
258  CHECK_GE(elem, type_info.min_val)
259  << "Element " << elem << " less than min val " << type_info.min_val;
260 #endif
261  int32_t* entry_ptr = slot_sel(elem, shard);
262  if (mapd_cas(entry_ptr, invalid_slot_val, index) != invalid_slot_val) {
263  return -1;
264  }
265  }
266  return 0;
267 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define mapd_cas(address, compare, val)
const size_t shard
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

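The sharded builders differ only in that each worker first routes the key to a shard with SHARD_FOR_KEY and skips rows it does not own, and the slot selector then indexes into that shard's entry_count_per_shard-sized region. A hedged sketch of the ownership check, assuming SHARD_FOR_KEY reduces to a modulo on the key (see shard_key.h for the actual definition):

#include <cstddef>
#include <cstdint>

// Assumption: SHARD_FOR_KEY(key, num_shards) behaves like a modulo on the
// (non-negative) key.
inline std::size_t shard_for_key(int64_t key, std::size_t num_shards) {
  return static_cast<std::size_t>(key < 0 ? -key : key) % num_shards;
}

// A worker building shard `my_shard` only inserts the rows it owns.
inline bool owned_by_shard(int64_t elem, std::size_t num_shards, std::size_t my_shard) {
  return shard_for_key(elem, num_shards) == my_shard;
}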

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const size_t  cpu_thread_count 
)

Definition at line 1753 of file HashJoinRuntime.cpp.

References CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), i, inclusive_scan(), and SUFFIX.

1764  {
1765  int32_t* pos_buff = buff;
1766  int32_t* count_buff = buff + hash_entry_count;
1767  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1768  std::vector<std::future<void>> counter_threads;
1769  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1770  if (join_buckets_per_key.size() > 0) {
1771  counter_threads.push_back(std::async(
1772  std::launch::async,
1773  [count_buff,
1774  composite_key_dict,
1775  &hash_entry_count,
1776  &join_buckets_per_key,
1777  &join_column_per_key,
1778  cpu_thread_idx,
1779  cpu_thread_count] {
1780  const auto key_handler = OverlapsKeyHandler(
1781  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1782  &join_column_per_key[0],
1783  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1784  count_matches_baseline(count_buff,
1785  composite_key_dict,
1786  hash_entry_count,
1787  &key_handler,
1788  join_column_per_key[0].num_elems,
1789  cpu_thread_idx,
1790  cpu_thread_count);
1791  }));
1792  } else {
1793  counter_threads.push_back(std::async(
1794  std::launch::async,
1795  [count_buff,
1796  composite_key_dict,
1797  &key_component_count,
1798  &hash_entry_count,
1799  &join_column_per_key,
1800  &type_info_per_key,
1801  &sd_inner_proxy_per_key,
1802  &sd_outer_proxy_per_key,
1803  cpu_thread_idx,
1804  cpu_thread_count] {
1805  const auto key_handler = GenericKeyHandler(key_component_count,
1806  true,
1807  &join_column_per_key[0],
1808  &type_info_per_key[0],
1809  &sd_inner_proxy_per_key[0],
1810  &sd_outer_proxy_per_key[0]);
1811  count_matches_baseline(count_buff,
1812  composite_key_dict,
1813  hash_entry_count,
1814  &key_handler,
1815  join_column_per_key[0].num_elems,
1816  cpu_thread_idx,
1817  cpu_thread_count);
1818  }));
1819  }
1820  }
1821 
1822  for (auto& child : counter_threads) {
1823  child.get();
1824  }
1825 
1826  std::vector<int32_t> count_copy(hash_entry_count, 0);
1827  CHECK_GT(hash_entry_count, int64_t(0));
1828  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1829  inclusive_scan(
1830  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1831  std::vector<std::future<void>> pos_threads;
1832  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1833  pos_threads.push_back(std::async(
1834  std::launch::async,
1835  [&](const int thread_idx) {
1836  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1837  if (count_buff[i]) {
1838  pos_buff[i] = count_copy[i];
1839  }
1840  }
1841  },
1842  cpu_thread_idx));
1843  }
1844  for (auto& child : pos_threads) {
1845  child.get();
1846  }
1847 
1848  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1849  std::vector<std::future<void>> rowid_threads;
1850  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1851  if (join_buckets_per_key.size() > 0) {
1852  rowid_threads.push_back(std::async(
1853  std::launch::async,
1854  [buff,
1855  composite_key_dict,
1856  hash_entry_count,
1857  invalid_slot_val,
1858  &join_column_per_key,
1859  &join_buckets_per_key,
1860  cpu_thread_idx,
1861  cpu_thread_count] {
1862  const auto key_handler = OverlapsKeyHandler(
1863  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1864  &join_column_per_key[0],
1865  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1866  SUFFIX(fill_row_ids_baseline)
1867  (buff,
1868  composite_key_dict,
1869  hash_entry_count,
1870  invalid_slot_val,
1871  &key_handler,
1872  join_column_per_key[0].num_elems,
1873  cpu_thread_idx,
1874  cpu_thread_count);
1875  }));
1876  } else {
1877  rowid_threads.push_back(std::async(std::launch::async,
1878  [buff,
1879  composite_key_dict,
1880  hash_entry_count,
1881  invalid_slot_val,
1882  key_component_count,
1883  &join_column_per_key,
1884  &type_info_per_key,
1885  &sd_inner_proxy_per_key,
1886  &sd_outer_proxy_per_key,
1887  cpu_thread_idx,
1888  cpu_thread_count] {
1889  const auto key_handler = GenericKeyHandler(
1890  key_component_count,
1891  true,
1892  &join_column_per_key[0],
1893  &type_info_per_key[0],
1894  &sd_inner_proxy_per_key[0],
1895  &sd_outer_proxy_per_key[0]);
1896  SUFFIX(fill_row_ids_baseline)
1897  (buff,
1898  composite_key_dict,
1899  hash_entry_count,
1900  invalid_slot_val,
1901  &key_handler,
1902  join_column_per_key[0].num_elems,
1903  cpu_thread_idx,
1904  cpu_thread_count);
1905  }));
1906  }
1907  }
1908 
1909  for (auto& child : rowid_threads) {
1910  child.get();
1911  }
1912 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

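The function above follows the same three-pass recipe as every one-to-many builder in this file: count the matches per slot, turn the counts into start offsets with a prefix scan (the memcpy into count_copy[1] followed by inclusive_scan amounts to an exclusive scan), then zero the counts and reuse them as cursors while scattering row ids. The buffer holds three consecutive regions: positions, counts, and row ids. A single-threaded sketch of the recipe, assuming each row's hash slot has already been computed:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// buff layout used by the one-to-many builders:
// [ pos (hash_entry_count) | count (hash_entry_count) | row ids (num rows) ]
std::vector<int32_t> build_one_to_many(const std::vector<int32_t>& slot_of_row,
                                       int32_t hash_entry_count) {
  std::vector<int32_t> buff(2 * hash_entry_count + slot_of_row.size(), 0);
  int32_t* pos_buff = buff.data();
  int32_t* count_buff = buff.data() + hash_entry_count;
  int32_t* id_buff = buff.data() + 2 * hash_entry_count;

  for (auto slot : slot_of_row) {  // pass 1: count matches per slot
    ++count_buff[slot];
  }
  // pass 2: exclusive prefix sum of the counts gives each slot's start offset
  std::exclusive_scan(count_buff, count_buff + hash_entry_count, pos_buff, 0);
  std::fill(count_buff, count_buff + hash_entry_count, 0);  // reuse as cursors
  for (int32_t row = 0; row < static_cast<int32_t>(slot_of_row.size()); ++row) {
    const auto slot = slot_of_row[row];  // pass 3: scatter row ids
    id_buff[pos_buff[slot] + count_buff[slot]++] = row;
  }
  return buff;
}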

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count 
)

Definition at line 1914 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1925  {
1926  fill_one_to_many_baseline_hash_table<int32_t>(buff,
1927  composite_key_dict,
1928  hash_entry_count,
1929  invalid_slot_val,
1930  key_component_count,
1931  join_column_per_key,
1932  type_info_per_key,
1933  join_bucket_info,
1934  sd_inner_proxy_per_key,
1935  sd_outer_proxy_per_key,
1936  cpu_thread_count);
1937 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count 
)

Definition at line 1939 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1950  {
1951  fill_one_to_many_baseline_hash_table<int64_t>(buff,
1952  composite_key_dict,
1953  hash_entry_count,
1954  invalid_slot_val,
1955  key_component_count,
1956  join_column_per_key,
1957  type_info_per_key,
1958  join_bucket_info,
1959  sd_inner_proxy_per_key,
1960  sd_outer_proxy_per_key,
1961  cpu_thread_count);
1962 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1381 of file HashJoinRuntime.cpp.

References count_matches(), fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1388  {
1389  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1390  invalid_slot_val,
1391  &join_column,
1392  &type_info,
1393  sd_inner_proxy,
1394  sd_outer_proxy](auto cpu_thread_idx,
1395  auto cpu_thread_count) {
1396  SUFFIX(count_matches)
1397  (count_buff,
1398  invalid_slot_val,
1399  join_column,
1400  type_info,
1401  sd_inner_proxy,
1402  sd_outer_proxy,
1403  cpu_thread_idx,
1404  cpu_thread_count);
1405  };
1406  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1407  buff,
1408  invalid_slot_val,
1409  &join_column,
1410  &type_info,
1411  sd_inner_proxy,
1412  sd_outer_proxy](auto cpu_thread_idx,
1413  auto cpu_thread_count) {
1414  SUFFIX(fill_row_ids)
1415  (buff,
1416  hash_entry_count,
1417  invalid_slot_val,
1418  join_column,
1419  type_info,
1420  sd_inner_proxy,
1421  sd_outer_proxy,
1422  cpu_thread_idx,
1423  cpu_thread_count);
1424  };
1425 
1426  fill_one_to_many_hash_table_impl(buff,
1427  hash_entry_info.hash_entry_count,
1428  invalid_slot_val,
1429  join_column,
1430  type_info,
1431  sd_inner_proxy,
1432  sd_outer_proxy,
1433  cpu_thread_count,
1434  launch_count_matches,
1435  launch_fill_row_ids);
1436 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t hash_entry_count
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1438 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1445  {
1446  auto bucket_normalization = hash_entry_info.bucket_normalization;
1447  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1448  auto launch_count_matches = [bucket_normalization,
1449  count_buff = buff + hash_entry_count,
1450  invalid_slot_val,
1451  &join_column,
1452  &type_info,
1453  sd_inner_proxy,
1454  sd_outer_proxy](auto cpu_thread_idx,
1455  auto cpu_thread_count) {
1456  SUFFIX(count_matches_bucketized)
1457  (count_buff,
1458  invalid_slot_val,
1459  join_column,
1460  type_info,
1461  sd_inner_proxy,
1462  sd_outer_proxy,
1463  cpu_thread_idx,
1464  cpu_thread_count,
1465  bucket_normalization);
1466  };
1467  auto launch_fill_row_ids = [bucket_normalization,
1468  hash_entry_count,
1469  buff,
1470  invalid_slot_val,
1471  &join_column,
1472  &type_info,
1473  sd_inner_proxy,
1474  sd_outer_proxy](auto cpu_thread_idx,
1475  auto cpu_thread_count) {
1476  SUFFIX(fill_row_ids_bucketized)
1477  (buff,
1478  hash_entry_count,
1479  invalid_slot_val,
1480  join_column,
1481  type_info,
1482  sd_inner_proxy,
1483  sd_outer_proxy,
1484  cpu_thread_idx,
1485  cpu_thread_count,
1486  bucket_normalization);
1487  };
1488 
1489  fill_one_to_many_hash_table_impl(buff,
1490  hash_entry_count,
1491  invalid_slot_val,
1492  join_column,
1493  type_info,
1494  sd_inner_proxy,
1495  sd_outer_proxy,
1496  cpu_thread_count,
1497  launch_count_matches,
1498  launch_fill_row_ids);
1499 }
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
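As a side note on the arithmetic this bucketized variant relies on: a key is assumed to map to slot (key - min_key) / bucket_normalization, so a column stored at a finer granularity than it is joined on collapses to one slot per bucket, and the table only needs getNormalizedHashEntryCount() entries. The sketch below is illustrative only; the helper name and the epoch-seconds scenario are assumptions, not taken from this file.

#include <cstdint>
#include <iostream>

// Assumed to mirror the indexing of get_bucketized_hash_slot() in JoinHashImpl.h.
int64_t bucketized_slot(int64_t key, int64_t min_key, int64_t bucket_normalization) {
  return (key - min_key) / bucket_normalization;
}

int main() {
  const int64_t bucket_normalization = 86400;  // hypothetical: seconds per day
  const int64_t min_key = 0;
  // Keys one day apart land in adjacent slots instead of 86400 slots apart.
  std::cout << bucketized_slot(0, min_key, bucket_normalization) << ' '        // 0
            << bucketized_slot(86400, min_key, bucket_normalization) << ' '    // 1
            << bucketized_slot(172799, min_key, bucket_normalization) << '\n'; // 1
}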

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1320 of file HashJoinRuntime.cpp.

References CHECK_GT, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1329  {
1330  int32_t* pos_buff = buff;
1331  int32_t* count_buff = buff + hash_entry_count;
1332  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1333  std::vector<std::future<void>> counter_threads;
1334  for (unsigned cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1335  counter_threads.push_back(std::async(
1336  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1337  }
1338 
1339  for (auto& child : counter_threads) {
1340  child.get();
1341  }
1342 
1343  std::vector<int32_t> count_copy(hash_entry_count, 0);
1344  CHECK_GT(hash_entry_count, int64_t(0));
1345  memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1346 #if HAVE_CUDA
1347  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1348 #else
1349  inclusive_scan(
1350  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1351 #endif
1352  std::vector<std::future<void>> pos_threads;
1353  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1354  pos_threads.push_back(std::async(
1355  std::launch::async,
1356  [&](size_t thread_idx) {
1357  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1358  if (count_buff[i]) {
1359  pos_buff[i] = count_copy[i];
1360  }
1361  }
1362  },
1363  cpu_thread_idx));
1364  }
1365  for (auto& child : pos_threads) {
1366  child.get();
1367  }
1368 
1369  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1370  std::vector<std::future<void>> rowid_threads;
1371  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1372  rowid_threads.push_back(std::async(
1373  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1374  }
1375 
1376  for (auto& child : rowid_threads) {
1377  child.get();
1378  }
1379 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209
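To make the three phases above easier to follow, here is a minimal single-threaded sketch of the same [pos | count | row ids] buffer layout, assuming a trivial perfect hash (slot = key - min_key). It is illustrative only and does not use the real JoinColumn machinery or the multithreaded scan.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const std::vector<int64_t> column{3, 1, 3, 2, 3};  // hypothetical join column
  const int64_t min_key = 1;
  const int64_t hash_entry_count = 3;  // keys 1..3

  // buff = [pos (N) | count (N) | row ids (num rows)]
  std::vector<int32_t> buff(2 * hash_entry_count + column.size(), 0);
  int32_t* pos = buff.data();
  int32_t* count = pos + hash_entry_count;
  int32_t* ids = count + hash_entry_count;

  // Phase 1: count matches per slot.
  for (auto key : column) {
    ++count[key - min_key];
  }

  // Phase 2: an exclusive prefix sum of the counts gives each slot's start offset.
  std::vector<int32_t> offsets(hash_entry_count, 0);
  std::partial_sum(count, count + hash_entry_count - 1, offsets.begin() + 1);
  for (int64_t i = 0; i < hash_entry_count; ++i) {
    if (count[i]) {
      pos[i] = offsets[i];
    }
    count[i] = 0;  // reset; reused as a per-slot cursor in phase 3
  }

  // Phase 3: append each row index to its slot's id range.
  for (std::size_t row = 0; row < column.size(); ++row) {
    const auto slot = column[row] - min_key;
    ids[pos[slot] + count[slot]++] = static_cast<int32_t>(row);
  }

  for (int64_t i = 0; i < hash_entry_count; ++i) {
    std::cout << "key " << (i + min_key) << ": rows";
    for (int32_t j = 0; j < count[i]; ++j) {
      std::cout << ' ' << ids[pos[i] + j];
    }
    std::cout << '\n';
  }
}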

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1561 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1569  {
1570  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1571  invalid_slot_val,
1572  &join_column,
1573  &type_info,
1574  &shard_info
1575 #ifndef __CUDACC__
1576  ,
1577  sd_inner_proxy,
1578  sd_outer_proxy
1579 #endif
1580  ](auto cpu_thread_idx, auto cpu_thread_count) {
1581  return SUFFIX(count_matches_sharded)(count_buff,
1582  invalid_slot_val,
1583  join_column,
1584  type_info,
1585  shard_info
1586 #ifndef __CUDACC__
1587  ,
1588  sd_inner_proxy,
1589  sd_outer_proxy,
1590  cpu_thread_idx,
1591  cpu_thread_count
1592 #endif
1593  );
1594  };
1595 
1596  auto launch_fill_row_ids = [buff,
1597  hash_entry_count,
1598  invalid_slot_val,
1599  &join_column,
1600  &type_info,
1601  &shard_info
1602 #ifndef __CUDACC__
1603  ,
1604  sd_inner_proxy,
1605  sd_outer_proxy
1606 #endif
1607  ](auto cpu_thread_idx, auto cpu_thread_count) {
1608  return SUFFIX(fill_row_ids_sharded)(buff,
1609  hash_entry_count,
1610  invalid_slot_val,
1611  join_column,
1612  type_info,
1613  shard_info
1614 #ifndef __CUDACC__
1615  ,
1616  sd_inner_proxy,
1617  sd_outer_proxy,
1618  cpu_thread_idx,
1619  cpu_thread_count);
1620 #endif
1621  };
1622 
1623  fill_one_to_many_hash_table_sharded_impl(buff,
1624  hash_entry_count,
1625  invalid_slot_val,
1626  join_column,
1627  type_info,
1628  shard_info
1629 #ifndef __CUDACC__
1630  ,
1631  sd_inner_proxy,
1632  sd_outer_proxy,
1633  cpu_thread_count
1634 #endif
1635  ,
1636  launch_count_matches,
1637  launch_fill_row_ids);
1638 }
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1502 of file HashJoinRuntime.cpp.

References CHECK_GT, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1513  {
1514  int32_t* pos_buff = buff;
1515  int32_t* count_buff = buff + hash_entry_count;
1516  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1517  std::vector<std::future<void>> counter_threads;
1518  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1519  counter_threads.push_back(std::async(
1520  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1521  }
1522 
1523  for (auto& child : counter_threads) {
1524  child.get();
1525  }
1526 
1527  std::vector<int32_t> count_copy(hash_entry_count, 0);
1528  CHECK_GT(hash_entry_count, int64_t(0));
1529  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1530  inclusive_scan(
1531  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1532  std::vector<std::future<void>> pos_threads;
1533  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1534  pos_threads.push_back(std::async(
1535  std::launch::async,
1536  [&](const unsigned thread_idx) {
1537  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1538  if (count_buff[i]) {
1539  pos_buff[i] = count_copy[i];
1540  }
1541  }
1542  },
1543  cpu_thread_idx));
1544  }
1545  for (auto& child : pos_threads) {
1546  child.get();
1547  }
1548 
1549  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1550  std::vector<std::future<void>> rowid_threads;
1551  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1552  rowid_threads.push_back(std::async(
1553  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1554  }
1555 
1556  for (auto& child : rowid_threads) {
1557  child.get();
1558  }
1559 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 859 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

871  {
872  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
873  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
874  };
875 
876  fill_row_ids_impl(buff,
877  hash_entry_count,
878  invalid_slot_val,
879  join_column,
880  type_info
881 #ifndef __CUDACC__
882  ,
883  sd_inner_proxy,
884  sd_outer_proxy,
885  cpu_thread_idx,
886  cpu_thread_count
887 #endif
888  ,
889  slot_sel);
890 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:39
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1070 of file HashJoinRuntime.cpp.

References CHECK_NE, g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

1081  {
1082  int32_t* pos_buff = buff;
1083  int32_t* count_buff = buff + hash_entry_count;
1084  int32_t* id_buff = count_buff + hash_entry_count;
1085 #ifdef __CUDACC__
1086  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1087  int32_t step = blockDim.x * gridDim.x;
1088 #else
1089  int32_t start = cpu_thread_idx;
1090  int32_t step = cpu_thread_count;
1091 #endif
1092 
1093  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1094 #ifdef __CUDACC__
1095  assert(composite_key_dict);
1096 #endif
1097  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1098  auto key_buff_handler = [composite_key_dict,
1099  hash_entry_count,
1100  pos_buff,
1101  invalid_slot_val,
1102  count_buff,
1103  id_buff,
1104  key_size_in_bytes](const int64_t row_index,
1105  const T* key_scratch_buff,
1106  const size_t key_component_count) {
1107  const T* matching_group =
1108  SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,
1109  key_component_count,
1110  composite_key_dict,
1111  hash_entry_count,
1112  key_size_in_bytes);
1113  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1114  int32_t* pos_ptr = pos_buff + entry_idx;
1115 #ifndef __CUDACC__
1116  CHECK_NE(*pos_ptr, invalid_slot_val);
1117 #endif
1118  const auto bin_idx = pos_ptr - pos_buff;
1119  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1120  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1121  return 0;
1122  };
1123 
1124  JoinColumnTuple cols(
1125  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1126  for (auto& it : cols.slice(start, step)) {
1127  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1128  }
1129  return;
1130 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define CHECK_NE(x, y)
Definition: Logger.h:206
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce
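The entry-index arithmetic at line 1113 relies on the composite key dictionary being a flat array in which every entry is key_component_count values wide. A tiny standalone illustration (the dictionary contents are made up):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::ptrdiff_t key_component_count = 2;
  // Three composite keys of two int64_t components each, stored back to back.
  const std::vector<int64_t> composite_key_dict = {1, 10, 2, 20, 3, 30};

  const int64_t* matching_group = composite_key_dict.data() + 4;  // points at key {3, 30}
  const auto entry_idx =
      (matching_group - composite_key_dict.data()) / key_component_count;
  std::cout << entry_idx << '\n';  // 2
}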

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 892 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

905  {
906  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
907  return SUFFIX(get_bucketized_hash_slot)(
908  pos_buff, elem, type_info.min_val, bucket_normalization);
909  };
910  fill_row_ids_impl(buff,
911  hash_entry_count,
912  invalid_slot_val,
913  join_column,
914  type_info
915 #ifndef __CUDACC__
916  ,
917  sd_inner_proxy,
918  sd_outer_proxy,
919  cpu_thread_idx,
920  cpu_thread_count
921 #endif
922  ,
923  slot_sel);
924 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:31
#define SUFFIX(name)
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 800 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

813  {
814  int32_t* pos_buff = buff;
815  int32_t* count_buff = buff + hash_entry_count;
816  int32_t* id_buff = count_buff + hash_entry_count;
817 
818 #ifdef __CUDACC__
819  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
820  int32_t step = blockDim.x * gridDim.x;
821 #else
822  int32_t start = cpu_thread_idx;
823  int32_t step = cpu_thread_count;
824 #endif
825  JoinColumnTyped col{&join_column, &type_info};
826  for (auto item : col.slice(start, step)) {
827  const size_t index = item.index;
828  int64_t elem = item.element;
829  if (elem == type_info.null_val) {
830  if (type_info.uses_bw_eq) {
831  elem = type_info.translated_null_val;
832  } else {
833  continue;
834  }
835  }
836 #ifndef __CUDACC__
837  if (sd_inner_proxy &&
838  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
839  const auto outer_id = translate_str_id_to_outer_dict(
840  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
841  if (outer_id == StringDictionary::INVALID_STR_ID) {
842  continue;
843  }
844  elem = outer_id;
845  }
846  CHECK_GE(elem, type_info.min_val)
847  << "Element " << elem << " less than min val " << type_info.min_val;
848 #endif
849  auto pos_ptr = slot_selector(pos_buff, elem);
850 #ifndef __CUDACC__
851  CHECK_NE(*pos_ptr, invalid_slot_val);
852 #endif
853  const auto bin_idx = pos_ptr - pos_buff;
854  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
855  id_buff[id_buff_idx] = static_cast<int32_t>(index);
856  }
857 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:206
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)
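A standalone sketch of the append step above, with std::atomic standing in for mapd_add: during fill_row_ids_impl, each slot's entry in count_buff acts as an atomic cursor, so threads can interleave freely while writing row ids into the slot's reserved [pos, pos + count) range. The slot offset and the row distribution below are made up for illustration.

#include <atomic>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  const int32_t pos = 0;            // start offset reserved for this slot (from pos_buff)
  const int32_t expected_rows = 8;  // rows hashing to this slot
  std::atomic<int32_t> cursor{0};   // this slot's entry in count_buff
  std::vector<int32_t> id_buff(expected_rows, -1);

  auto append = [&](int32_t first_row, int32_t step) {
    for (int32_t row = first_row; row < expected_rows; row += step) {
      const int32_t idx = cursor.fetch_add(1) + pos;  // mapd_add(count_buff + bin_idx, 1) + *pos_ptr
      id_buff[idx] = row;
    }
  };

  std::thread t0(append, 0, 2);
  std::thread t1(append, 1, 2);
  t0.join();
  t1.join();

  for (int32_t id : id_buff) {
    std::cout << id << ' ';  // all eight row ids, in some interleaved order
  }
  std::cout << '\n';
}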

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 988 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

1001  {
1002  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
1003  return SUFFIX(get_hash_slot_sharded)(pos_buff,
1004  elem,
1005  type_info.min_val,
1006  shard_info.entry_count_per_shard,
1007  shard_info.num_shards,
1008  shard_info.device_count);
1009  };
1010 
1011  fill_row_ids_impl(buff,
1012  hash_entry_count,
1013  invalid_slot_val,
1014  join_column,
1015  type_info
1016 #ifndef __CUDACC__
1017  ,
1018  sd_inner_proxy,
1019  sd_outer_proxy,
1020  cpu_thread_idx,
1021  cpu_thread_count
1022 #endif
1023  ,
1024  slot_sel);
1025 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:60
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1027 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1041  {
1042  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1043  auto elem) {
1044  return SUFFIX(get_bucketized_hash_slot_sharded)(pos_buff,
1045  elem,
1046  type_info.min_val,
1047  shard_info.entry_count_per_shard,
1048  shard_info.num_shards,
1049  shard_info.device_count,
1050  bucket_normalization);
1051  };
1052 
1053  fill_row_ids_impl(buff,
1054  hash_entry_count,
1055  invalid_slot_val,
1056  join_column,
1057  type_info
1058 #ifndef __CUDACC__
1059  ,
1060  sd_inner_proxy,
1061  sd_outer_proxy,
1062  cpu_thread_idx,
1063  cpu_thread_count
1064 #endif
1065  ,
1066  slot_sel);
1067 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:45
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 927 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

941  {
942 
943  int32_t* pos_buff = buff;
944  int32_t* count_buff = buff + hash_entry_count;
945  int32_t* id_buff = count_buff + hash_entry_count;
946 
947 #ifdef __CUDACC__
948  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
949  int32_t step = blockDim.x * gridDim.x;
950 #else
951  int32_t start = cpu_thread_idx;
952  int32_t step = cpu_thread_count;
953 #endif
954  JoinColumnTyped col{&join_column, &type_info};
955  for (auto item : col.slice(start, step)) {
956  const size_t index = item.index;
957  int64_t elem = item.element;
958  if (elem == type_info.null_val) {
959  if (type_info.uses_bw_eq) {
960  elem = type_info.translated_null_val;
961  } else {
962  continue;
963  }
964  }
965 #ifndef __CUDACC__
966  if (sd_inner_proxy &&
967  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
968  const auto outer_id = translate_str_id_to_outer_dict(
969  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
970  if (outer_id == StringDictionary::INVALID_STR_ID) {
971  continue;
972  }
973  elem = outer_id;
974  }
975  CHECK_GE(elem, type_info.min_val)
976  << "Element " << elem << " less than min val " << type_info.min_val;
977 #endif
978  auto* pos_ptr = slot_selector(pos_buff, elem);
979 #ifndef __CUDACC__
980  CHECK_NE(*pos_ptr, invalid_slot_val);
981 #endif
982  const auto bin_idx = pos_ptr - pos_buff;
983  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
984  id_buff[id_buff_idx] = static_cast<int32_t>(index);
985  }
986 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:206
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 420 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), i, load_cst, store_cst, SUFFIX, omnisci.dtypes::T, and UNLIKELY.

Referenced by write_baseline_hash_slot().

424  {
425  uint32_t off = h * hash_entry_size;
426  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
427  T empty_key = SUFFIX(get_invalid_key)<T>();
428  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
429  if (UNLIKELY(*key == write_pending)) {
430  // Address the singularity case where the first column contains the pending
431  // write special value. Should never happen, but avoid doing wrong things.
432  return nullptr;
433  }
434  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
435  if (success) {
436  if (key_component_count > 1) {
437  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
438  }
439  store_cst(row_ptr, *key);
440  return reinterpret_cast<T*>(row_ptr + key_component_count);
441  }
442  while (load_cst(row_ptr) == write_pending) {
443  // spin until the winning thread has finished writing the entire key
444  }
445  for (size_t i = 0; i < key_component_count; ++i) {
446  if (load_cst(row_ptr + i) != key[i]) {
447  return nullptr;
448  }
449  }
450  return reinterpret_cast<T*>(row_ptr + key_component_count);
451 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)
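The sketch below reproduces the claim/publish protocol in standalone form, using std::atomic operations in place of the cas_cst / store_cst / load_cst macros. The first key component doubles as the slot's state word: empty, then pending on a successful claim, then the real key[0] once the rest of the key has been written. The kEmpty / kPending sentinels and the two-component key are assumptions made for brevity, not values taken from this file.

#include <atomic>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

constexpr int64_t kEmpty = -1;    // stand-in for get_invalid_key<T>()
constexpr int64_t kPending = -2;  // stand-in for get_invalid_key<T>() - 1

// One hash entry: key0 is atomic and acts as the slot's state word.
struct Entry {
  std::atomic<int64_t> key0{kEmpty};
  int64_t key1 = 0;  // remaining key components are plain memory
};

// Returns true if this thread claimed the slot or the slot already holds `key`.
bool claim_or_match(Entry& e, const int64_t key[2]) {
  int64_t expected = kEmpty;
  if (e.key0.compare_exchange_strong(expected, kPending)) {
    e.key1 = key[1];       // write the tail of the key...
    e.key0.store(key[0]);  // ...then publish by storing the first component
    return true;
  }
  while (e.key0.load() == kPending) {
    // spin until the winning thread has published the entire key
  }
  return e.key0.load() == key[0] && e.key1 == key[1];
}

int main() {
  Entry slot;
  const int64_t key[2] = {42, 7};
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back([&] { claim_or_match(slot, key); });
  }
  for (auto& t : threads) {
    t.join();
  }
  std::cout << slot.key0.load() << ' ' << slot.key1 << '\n';  // 42 7
}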

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE NEVER_INLINE const T* SUFFIX() get_matching_baseline_hash_slot_readonly ( const T *  key,
const size_t  key_component_count,
const T *  composite_key_dict,
const int64_t  entry_count,
const size_t  key_size_in_bytes 
)

Definition at line 724 of file HashJoinRuntime.cpp.

References CHECK, keys_are_equal(), and MurmurHash1Impl().

Referenced by count_matches_baseline(), and fill_row_ids_baseline().

729  {
730  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
731  uint32_t off = h * key_component_count;
732  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
733  return &composite_key_dict[off];
734  }
735  uint32_t h_probe = (h + 1) % entry_count;
736  while (h_probe != h) {
737  off = h_probe * key_component_count;
738  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
739  return &composite_key_dict[off];
740  }
741  h_probe = (h_probe + 1) % entry_count;
742  }
743 #ifndef __CUDACC__
744  CHECK(false);
745 #else
746  assert(false);
747 #endif
748  return nullptr;
749 }
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename InputIterator , typename OutputIterator >
void inclusive_scan ( InputIterator  first,
InputIterator  last,
OutputIterator  out,
const size_t  thread_count 
)

Definition at line 1248 of file HashJoinRuntime.cpp.

References gpu_enabled::partial_sum().

Referenced by approximate_distinct_tuples_on_device_overlaps(), approximate_distinct_tuples_overlaps(), fill_one_to_many_baseline_hash_table(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_impl(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), fill_one_to_many_hash_table_sharded_impl(), and gpu_enabled::partial_sum().

1251  {
1252  using ElementType = typename InputIterator::value_type;
1253  using OffsetType = typename InputIterator::difference_type;
1254  const OffsetType elem_count = last - first;
1255  if (elem_count < 10000 || thread_count <= 1) {
1256  ElementType sum = 0;
1257  for (auto iter = first; iter != last; ++iter, ++out) {
1258  *out = sum += *iter;
1259  }
1260  return;
1261  }
1262 
1263  const OffsetType step = (elem_count + thread_count - 1) / thread_count;
1264  OffsetType start_off = 0;
1265  OffsetType end_off = std::min(step, elem_count);
1266  std::vector<ElementType> partial_sums(thread_count);
1267  std::vector<std::future<void>> counter_threads;
1268  for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,
1269  start_off = std::min(start_off + step, elem_count),
1270  end_off = std::min(start_off + step, elem_count)) {
1271  counter_threads.push_back(std::async(
1272  std::launch::async,
1273  [first, out](
1274  ElementType& partial_sum, const OffsetType start, const OffsetType end) {
1275  ElementType sum = 0;
1276  for (auto in_iter = first + start, out_iter = out + start;
1277  in_iter != (first + end);
1278  ++in_iter, ++out_iter) {
1279  *out_iter = sum += *in_iter;
1280  }
1281  partial_sum = sum;
1282  },
1283  std::ref(partial_sums[thread_idx]),
1284  start_off,
1285  end_off));
1286  }
1287  for (auto& child : counter_threads) {
1288  child.get();
1289  }
1290 
1291  ElementType sum = 0;
1292  for (auto& s : partial_sums) {
1293  s += sum;
1294  sum = s;
1295  }
1296 
1297  counter_threads.clear();
1298  start_off = std::min(step, elem_count);
1299  end_off = std::min(start_off + step, elem_count);
1300  for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,
1301  start_off = std::min(start_off + step, elem_count),
1302  end_off = std::min(start_off + step, elem_count)) {
1303  counter_threads.push_back(std::async(
1304  std::launch::async,
1305  [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {
1306  for (auto iter = out + start; iter != (out + end); ++iter) {
1307  *iter += prev_sum;
1308  }
1309  },
1310  partial_sums[thread_idx],
1311  start_off,
1312  end_off));
1313  }
1314  for (auto& child : counter_threads) {
1315  child.get();
1316  }
1317 }
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87
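The following self-contained sketch mirrors the same two-pass blocked scheme: each worker scans its own chunk, a serial pass accumulates the chunk totals, and a second parallel pass adds the total of all preceding chunks to every later chunk. It is only an illustration of the algorithm on a fixed small input, not the function above.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <future>
#include <iostream>
#include <numeric>
#include <vector>

void blocked_inclusive_scan(std::vector<int32_t>& v, const std::size_t thread_count) {
  const std::size_t n = v.size();
  const std::size_t step = (n + thread_count - 1) / thread_count;
  std::vector<int32_t> chunk_sums(thread_count, 0);

  // Pass 1: scan each chunk independently and record its total.
  std::vector<std::future<void>> workers;
  for (std::size_t t = 0; t < thread_count; ++t) {
    workers.push_back(std::async(std::launch::async, [&v, &chunk_sums, t, step, n] {
      const std::size_t begin = std::min(t * step, n);
      const std::size_t end = std::min(begin + step, n);
      int32_t sum = 0;
      for (std::size_t i = begin; i < end; ++i) {
        v[i] = sum += v[i];
      }
      chunk_sums[t] = sum;
    }));
  }
  for (auto& w : workers) {
    w.get();
  }

  // Serial prefix over the chunk totals.
  std::partial_sum(chunk_sums.begin(), chunk_sums.end(), chunk_sums.begin());

  // Pass 2: add the running total of preceding chunks to every later chunk.
  workers.clear();
  for (std::size_t t = 1; t < thread_count; ++t) {
    workers.push_back(std::async(std::launch::async, [&v, &chunk_sums, t, step, n] {
      const std::size_t begin = std::min(t * step, n);
      const std::size_t end = std::min(begin + step, n);
      for (std::size_t i = begin; i < end; ++i) {
        v[i] += chunk_sums[t - 1];
      }
    }));
  }
  for (auto& w : workers) {
    w.get();
  }
}

int main() {
  std::vector<int32_t> v(20);
  std::iota(v.begin(), v.end(), 1);  // 1..20
  blocked_inclusive_scan(v, 4);
  std::cout << v.back() << '\n';  // 210 == 20 * 21 / 2
}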

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 334 of file HashJoinRuntime.cpp.

References get_invalid_key(), i, SUFFIX, and omnisci.dtypes::T.

Referenced by init_baseline_hash_join_buff_wrapper().

340  {
341 #ifdef __CUDACC__
342  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
343  int32_t step = blockDim.x * gridDim.x;
344 #else
345  int32_t start = cpu_thread_idx;
346  int32_t step = cpu_thread_count;
347 #endif
348  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
349  const T empty_key = SUFFIX(get_invalid_key)<T>();
350  for (int64_t h = start; h < entry_count; h += step) {
351  int64_t off = h * hash_entry_size;
352  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
353  for (size_t i = 0; i < key_component_count; ++i) {
354  row_ptr[i] = empty_key;
355  }
356  if (with_val_slot) {
357  row_ptr[key_component_count] = invalid_slot_val;
358  }
359  }
360 }
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1640 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1646  {
1647  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1648  entry_count,
1649  key_component_count,
1650  with_val_slot,
1651  invalid_slot_val,
1652  cpu_thread_idx,
1653  cpu_thread_count);
1654 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1656 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1662  {
1663  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1664  entry_count,
1665  key_component_count,
1666  with_val_slot,
1667  invalid_slot_val,
1668  cpu_thread_idx,
1669  cpu_thread_count);
1670 }

+ Here is the caller graph for this function:

DEVICE void SUFFIX() init_hash_join_buff ( int32_t *  groups_buffer,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 92 of file HashJoinRuntime.cpp.

References i.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

96  {
97 #ifdef __CUDACC__
98  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
99  int32_t step = blockDim.x * gridDim.x;
100 #else
101  int32_t start = cpu_thread_idx;
102  int32_t step = cpu_thread_count;
103 #endif
104  for (int64_t i = start; i < hash_entry_count; i += step) {
105  groups_buffer[i] = invalid_slot_val;
106  }
107 }
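A minimal sketch of how the CPU side of this file drives the interleaved (start = thread index, step = thread count) partitioning with std::async; the same per-element work as init_hash_join_buff appears inside the per-thread lambda. The buffer size and thread count are arbitrary illustration values.

#include <cstdint>
#include <future>
#include <iostream>
#include <vector>

int main() {
  const int64_t hash_entry_count = 10;
  const int32_t invalid_slot_val = -1;
  const int32_t cpu_thread_count = 3;
  std::vector<int32_t> groups_buffer(hash_entry_count, 0);

  std::vector<std::future<void>> init_threads;
  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
    init_threads.push_back(std::async(std::launch::async, [&, cpu_thread_idx] {
      for (int64_t i = cpu_thread_idx; i < hash_entry_count; i += cpu_thread_count) {
        groups_buffer[i] = invalid_slot_val;  // same assignment as init_hash_join_buff
      }
    }));
  }
  for (auto& child : init_threads) {
    child.get();
  }

  for (auto v : groups_buffer) {
    std::cout << v << ' ';  // every entry now holds invalid_slot_val
  }
  std::cout << '\n';
}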

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1692 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1700  {
1701  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1702  entry_count,
1703  invalid_slot_val,
1704  key_component_count,
1705  with_val_slot,
1706  key_handler,
1707  num_elems,
1708  cpu_thread_idx,
1709  cpu_thread_count);
1710 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1732 of file HashJoinRuntime.cpp.

1740  {
1741  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1742  entry_count,
1743  invalid_slot_val,
1744  key_component_count,
1745  with_val_slot,
1746  key_handler,
1747  num_elems,
1748  cpu_thread_idx,
1749  cpu_thread_count);
1750 }
template<typename T >
DEVICE int write_baseline_hash_slot ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 460 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and omnisci.dtypes::T.

468  {
469  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
470  T* matching_group = get_matching_baseline_hash_slot_at(
471  hash_buff, h, key, key_component_count, hash_entry_size);
472  if (!matching_group) {
473  uint32_t h_probe = (h + 1) % entry_count;
474  while (h_probe != h) {
475  matching_group = get_matching_baseline_hash_slot_at(
476  hash_buff, h_probe, key, key_component_count, hash_entry_size);
477  if (matching_group) {
478  break;
479  }
480  h_probe = (h_probe + 1) % entry_count;
481  }
482  }
483  if (!matching_group) {
484  return -2;
485  }
486  if (!with_val_slot) {
487  return 0;
488  }
489  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
490  return -1;
491  }
492  return 0;
493 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)
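For reference, a single-threaded standalone sketch of the same probe-with-wraparound loop: start at hash(key) % entry_count and walk forward until a slot is claimed or the key is found, giving up only after a full cycle (the function above returns -2 in that case). The trivial modulo hash and the Slot struct are illustrative stand-ins for MurmurHash1Impl and the flat entry layout; the sketch also omits the concurrency handled by get_matching_baseline_hash_slot_at.

#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kEmpty = -1;  // stand-in for an empty key

struct Slot {
  int64_t key = kEmpty;
  int32_t val = -1;
};

// Returns the slot index holding `key`, or -1 if the table is full.
int64_t insert_or_get(std::vector<Slot>& table, const int64_t key, const int32_t val) {
  const auto entry_count = static_cast<int64_t>(table.size());
  const int64_t h = key % entry_count;  // stand-in for MurmurHash1Impl(...) % entry_count
  int64_t probe = h;
  do {
    Slot& slot = table[probe];
    if (slot.key == kEmpty) {  // claim an empty slot
      slot.key = key;
      slot.val = val;
      return probe;
    }
    if (slot.key == key) {  // key already present
      return probe;
    }
    probe = (probe + 1) % entry_count;
  } while (probe != h);
  return -1;  // table is full
}

int main() {
  std::vector<Slot> table(5);
  insert_or_get(table, 10, 100);                       // hashes to slot 0
  insert_or_get(table, 15, 101);                       // collides, probes to slot 1
  std::cout << insert_or_get(table, 15, 999) << '\n';  // finds the existing entry: 1
}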

+ Here is the call graph for this function: