OmniSciDB  eb3a3d0a03
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntime.cpp File Reference
+ Include dependency graph for HashJoinRuntime.cpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict (const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const size_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 

Macro Definition Documentation

#define cas_cst (   ptr,
  expected,
  desired 
)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 430 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst (   ptr)    __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 434 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add (   address,
  val 
)    __sync_fetch_and_add(address, val)
#define mapd_cas (   address,
  compare,
  val 
)    __sync_val_compare_and_swap(address, compare, val)
#define store_cst (   ptr,
  val 
)    __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 433 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2139 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2144  {
2145  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2146  CHECK(!join_column_per_key.empty());
2147 
2148  std::vector<std::future<void>> approx_distinct_threads;
2149  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
 2150  approx_distinct_threads.push_back(std::async(
 2151  std::launch::async,
 2152  [&join_column_per_key,
2153  &type_info_per_key,
2154  b,
2155  hll_buffer_all_cpus,
2156  padded_size_bytes,
2157  thread_idx,
2158  thread_count] {
2159  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2160 
2161  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2162  false,
2163  &join_column_per_key[0],
2164  &type_info_per_key[0],
2165  nullptr,
 2166  nullptr);
 2167  approximate_distinct_tuples_impl(hll_buffer,
 2168  nullptr,
2169  b,
2170  join_column_per_key[0].num_elems,
2171  &key_handler,
2172  thread_idx,
2173  thread_count);
2174  }));
2175  }
2176  for (auto& child : approx_distinct_threads) {
2177  child.get();
2178  }
2179 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1201 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), approximate_distinct_tuples_overlaps(), and approximate_distinct_tuples_range().

1211  {
1212 #ifdef __CUDACC__
1213  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1214  int32_t step = blockDim.x * gridDim.x;
1215 #else
1216  int32_t start = cpu_thread_idx;
1217  int32_t step = cpu_thread_count;
1218 #endif
1219 
1220  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1221  const int64_t entry_idx,
1222  const int64_t* key_scratch_buff,
1223  const size_t key_component_count) {
1224  if (row_count_buffer) {
1225  row_count_buffer[entry_idx] += 1;
1226  }
1227 
1228  const uint64_t hash =
1229  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1230  const uint32_t index = hash >> (64 - b);
1231  const auto rank = get_rank(hash << b, 64 - b);
1232 #ifdef __CUDACC__
1233  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1234 #else
1235  hll_buffer[index] = std::max(hll_buffer[index], rank);
1236 #endif
1237 
1238  return 0;
1239  };
1240 
1241  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1242 
1243  JoinColumnTuple cols(
1244  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1245  for (auto& it : cols.slice(start, step)) {
1246  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1247  }
1248 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
char * f
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2181 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2189  {
2190  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2191  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2192  CHECK(!join_column_per_key.empty());
2193 
2194  std::vector<std::future<void>> approx_distinct_threads;
2195  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2196  approx_distinct_threads.push_back(std::async(
2198  [&join_column_per_key,
2199  &join_buckets_per_key,
2200  &row_counts,
2201  b,
2202  hll_buffer_all_cpus,
2203  padded_size_bytes,
2204  thread_idx,
2205  thread_count] {
2206  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2207 
2208  const auto key_handler = OverlapsKeyHandler(
2209  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2210  &join_column_per_key[0],
 2211  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
 2212  approximate_distinct_tuples_impl(hll_buffer,
 2213  row_counts.data(),
2214  b,
2215  join_column_per_key[0].num_elems,
2216  &key_handler,
2217  thread_idx,
2218  thread_count);
2219  }));
2220  }
2221  for (auto& child : approx_distinct_threads) {
2222  child.get();
2223  }
 2224 
 2225  inclusive_scan(
 2226  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2227 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2229 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2238  {
2239  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2240  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2241  CHECK(!join_column_per_key.empty());
2242 
2243  std::vector<std::future<void>> approx_distinct_threads;
2244  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2245  approx_distinct_threads.push_back(std::async(
2247  [&join_column_per_key,
2248  &join_buckets_per_key,
2249  &row_counts,
2250  b,
2251  hll_buffer_all_cpus,
2252  padded_size_bytes,
2253  thread_idx,
2254  is_compressed,
2255  thread_count] {
2256  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2257 
2258  const auto key_handler = RangeKeyHandler(
2259  is_compressed,
2260  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2261  &join_column_per_key[0],
 2262  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
 2263  approximate_distinct_tuples_impl(hll_buffer,
 2264  row_counts.data(),
2265  b,
2266  join_column_per_key[0].num_elems,
2267  &key_handler,
2268  thread_idx,
2269  thread_count);
2270  }));
2271  }
2272  for (auto& child : approx_distinct_threads) {
2273  child.get();
2274  }
 2275 
 2276  inclusive_scan(
 2277  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2278 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1271 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), anonymous_namespace{Utm.h}::N, JoinColumnIterator::ptr(), and SUFFIX.

1280  {
1281 #ifdef __CUDACC__
1282  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1283  int32_t step = blockDim.x * gridDim.x;
1284 #else
1285  int32_t start = cpu_thread_idx;
1286  int32_t step = cpu_thread_count;
1287 #endif
1288  JoinColumnIterator it(join_column, type_info, start, step);
1289  for (; it; ++it) {
1290  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1291  double bounds[2 * N];
1292  for (size_t j = 0; j < 2 * N; j++) {
1293  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1294  }
1295 
1296  for (size_t j = 0; j < N; j++) {
1297  const auto diff = bounds[j + N] - bounds[j];
1298 #ifdef __CUDACC__
1299  if (diff > bucket_size_thresholds[j]) {
1300  atomicMin(&bucket_sizes_for_thread[j], diff);
1301  }
1302 #else
1303  if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {
1304  bucket_sizes_for_thread[j] = diff;
1305  }
1306 #endif
1307  }
1308  }
1309 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
constexpr unsigned N
Definition: Utm.h:112
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:134

+ Here is the call graph for this function:

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2280 of file HashJoinRuntime.cpp.

References threading_serial::async(), and i.

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2284  {
2285  std::vector<std::vector<double>> bucket_sizes_for_threads;
2286  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2287  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2288  }
2289  std::vector<std::future<void>> threads;
2290  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2291  threads.push_back(std::async(std::launch::async,
2292  compute_bucket_sizes_impl<2>,
2293  bucket_sizes_for_threads[thread_idx].data(),
2294  &join_column,
2295  &type_info,
2296  bucket_size_thresholds.data(),
2297  thread_idx,
2298  thread_count));
2299  }
2300  for (auto& child : threads) {
2301  child.get();
2302  }
2303 
2304  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2305  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2306  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2307  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2308  }
2309  }
2310  }
2311 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 674 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

685  {
686  auto slot_sel = [&type_info](auto count_buff, auto elem) {
687  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
688  };
689  count_matches_impl(count_buff,
690  invalid_slot_val,
691  join_column,
692  type_info
693 #ifndef __CUDACC__
694  ,
695  sd_inner_proxy,
696  sd_outer_proxy,
697  cpu_thread_idx,
698  cpu_thread_count
699 #endif
700  ,
701  slot_sel);
702 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:74
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 818 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

828  {
829 #ifdef __CUDACC__
830  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
831  int32_t step = blockDim.x * gridDim.x;
832 #else
833  int32_t start = cpu_thread_idx;
834  int32_t step = cpu_thread_count;
835 #endif
836 #ifdef __CUDACC__
837  assert(composite_key_dict);
838 #endif
839  T key_scratch_buff[g_maximum_conditions_to_coalesce];
840  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
841  auto key_buff_handler = [composite_key_dict,
842  entry_count,
843  count_buff,
844  key_size_in_bytes](const int64_t row_entry_idx,
845  const T* key_scratch_buff,
846  const size_t key_component_count) {
 847  const auto matching_group =
 848  get_matching_baseline_hash_slot_readonly(key_scratch_buff,
 849  key_component_count,
850  composite_key_dict,
851  entry_count,
852  key_size_in_bytes);
853  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
854  mapd_add(&count_buff[entry_idx], int32_t(1));
855  return 0;
856  };
857 
858  JoinColumnTuple cols(
859  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
860  for (auto& it : cols.slice(start, step)) {
861  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
862  }
863 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
char * f
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 704 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

716  {
 717  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
 718  return SUFFIX(get_bucketized_hash_slot)(
 719  count_buff, elem, type_info.min_val, bucket_normalization);
720  };
721  count_matches_impl(count_buff,
722  invalid_slot_val,
723  join_column,
724  type_info
725 #ifndef __CUDACC__
726  ,
727  sd_inner_proxy,
728  sd_outer_proxy,
729  cpu_thread_idx,
730  cpu_thread_count
731 #endif
732  ,
733  slot_sel);
734 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
#define SUFFIX(name)
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 626 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by count_matches(), and count_matches_bucketized().

638  {
639 #ifdef __CUDACC__
640  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
641  int32_t step = blockDim.x * gridDim.x;
642 #else
643  int32_t start = cpu_thread_idx;
644  int32_t step = cpu_thread_count;
645 #endif
646  JoinColumnTyped col{&join_column, &type_info};
647  for (auto item : col.slice(start, step)) {
648  int64_t elem = item.element;
649  if (elem == type_info.null_val) {
650  if (type_info.uses_bw_eq) {
651  elem = type_info.translated_null_val;
652  } else {
653  continue;
654  }
655  }
656 #ifndef __CUDACC__
657  if (sd_inner_proxy &&
658  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
659  const auto outer_id = translate_str_id_to_outer_dict(
660  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
661  if (outer_id == StringDictionary::INVALID_STR_ID) {
662  continue;
663  }
664  elem = outer_id;
665  }
666  CHECK_GE(elem, type_info.min_val)
667  << "Element " << elem << " less than min val " << type_info.min_val;
668 #endif
669  auto* entry_ptr = slot_selector(count_buff, elem);
670  mapd_add(entry_ptr, int32_t(1));
671  }
672 }
#define CHECK_GE(x, y)
Definition: Logger.h:222
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 736 of file HashJoinRuntime.cpp.

References CHECK_GE, get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, mapd_add, SUFFIX, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

748  {
749 #ifdef __CUDACC__
750  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
751  int32_t step = blockDim.x * gridDim.x;
752 #else
753  int32_t start = cpu_thread_idx;
754  int32_t step = cpu_thread_count;
755 #endif
756  JoinColumnTyped col{&join_column, &type_info};
757  for (auto item : col.slice(start, step)) {
758  int64_t elem = item.element;
759  if (elem == type_info.null_val) {
760  if (type_info.uses_bw_eq) {
761  elem = type_info.translated_null_val;
762  } else {
763  continue;
764  }
765  }
766 #ifndef __CUDACC__
767  if (sd_inner_proxy &&
768  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
769  const auto outer_id = translate_str_id_to_outer_dict(
770  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
771  if (outer_id == StringDictionary::INVALID_STR_ID) {
772  continue;
773  }
774  elem = outer_id;
775  }
776  CHECK_GE(elem, type_info.min_val)
777  << "Element " << elem << " less than min val " << type_info.min_val;
778 #endif
779  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
780  elem,
781  type_info.min_val,
782  shard_info.entry_count_per_shard,
783  shard_info.num_shards,
784  shard_info.device_count);
785  mapd_add(entry_ptr, int32_t(1));
786  }
787 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:222
#define SUFFIX(name)
const int device_count
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const size_t entry_count_per_shard
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:95
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 548 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, and omnisci.dtypes::T.

557  {
558 #ifdef __CUDACC__
559  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
560  int32_t step = blockDim.x * gridDim.x;
561 #else
562  int32_t start = cpu_thread_idx;
563  int32_t step = cpu_thread_count;
564 #endif
565 
566  T key_scratch_buff[g_maximum_conditions_to_coalesce];
567  const size_t key_size_in_bytes = key_component_count * sizeof(T);
568  const size_t hash_entry_size =
569  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
570  auto key_buff_handler = [hash_buff,
571  entry_count,
572  with_val_slot,
573  invalid_slot_val,
574  key_size_in_bytes,
575  hash_entry_size,
576  &for_semi_join](const int64_t entry_idx,
577  const T* key_scratch_buffer,
578  const size_t key_component_count) {
579  if (for_semi_join) {
580  return write_baseline_hash_slot_for_semi_join<T>(entry_idx,
581  hash_buff,
582  entry_count,
583  key_scratch_buffer,
584  key_component_count,
585  with_val_slot,
586  invalid_slot_val,
587  key_size_in_bytes,
588  hash_entry_size);
589  } else {
590  return write_baseline_hash_slot<T>(entry_idx,
591  hash_buff,
592  entry_count,
593  key_scratch_buffer,
594  key_component_count,
595  with_val_slot,
596  invalid_slot_val,
597  key_size_in_bytes,
598  hash_entry_size);
599  }
600  };
601 
602  JoinColumnTuple cols(
603  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
604  for (auto& it : cols.slice(start, step)) {
605  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
606  if (err) {
607  return err;
608  }
609  }
610  return 0;
611 }
char * f
const size_t g_maximum_conditions_to_coalesce
int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1738 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1747  {
1748  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1749  entry_count,
1750  invalid_slot_val,
1751  for_semi_join,
1752  key_component_count,
1753  with_val_slot,
1754  key_handler,
1755  num_elems,
1756  cpu_thread_idx,
1757  cpu_thread_count);
1758 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1802 of file HashJoinRuntime.cpp.

1811  {
1812  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1813  entry_count,
1814  invalid_slot_val,
1815  for_semi_join,
1816  key_component_count,
1817  with_val_slot,
1818  key_handler,
1819  num_elems,
1820  cpu_thread_idx,
1821  cpu_thread_count);
1822 }
DEVICE int SUFFIX() fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 197 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

205  {
206  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
207  : SUFFIX(fill_one_to_one_hashtable);
208  auto hashtable_filling_func = [&](auto elem, size_t index) {
209  auto entry_ptr = SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
210  return filling_func(index, entry_ptr, invalid_slot_val);
211  };
212 
213  return fill_hash_join_buff_impl(buff,
214  invalid_slot_val,
215  join_column,
216  type_info,
217  sd_inner_proxy,
218  sd_outer_proxy,
219  cpu_thread_idx,
220  cpu_thread_count,
221  hashtable_filling_func);
222 }
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:74
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 168 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

177  {
178  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
179  : SUFFIX(fill_one_to_one_hashtable);
180  auto hashtable_filling_func = [&](auto elem, size_t index) {
181  auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(
182  buff, elem, type_info.min_val, bucket_normalization);
183  return filling_func(index, entry_ptr, invalid_slot_val);
184  };
185 
186  return fill_hash_join_buff_impl(buff,
187  invalid_slot_val,
188  join_column,
189  type_info,
190  sd_inner_proxy,
191  sd_outer_proxy,
192  cpu_thread_idx,
193  cpu_thread_count,
194  hashtable_filling_func);
195 }
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 121 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff(), and fill_hash_join_buff_bucketized().

129  {
130 #ifdef __CUDACC__
131  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
132  int32_t step = blockDim.x * gridDim.x;
133 #else
134  int32_t start = cpu_thread_idx;
135  int32_t step = cpu_thread_count;
136 #endif
137  JoinColumnTyped col{&join_column, &type_info};
138  for (auto item : col.slice(start, step)) {
139  const size_t index = item.index;
140  int64_t elem = item.element;
141  if (elem == type_info.null_val) {
142  if (type_info.uses_bw_eq) {
143  elem = type_info.translated_null_val;
144  } else {
145  continue;
146  }
147  }
148 #ifndef __CUDACC__
149  if (sd_inner_proxy &&
150  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
151  const auto outer_id = translate_str_id_to_outer_dict(
152  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
153  if (outer_id == StringDictionary::INVALID_STR_ID) {
154  continue;
155  }
156  elem = outer_id;
157  }
158  CHECK_GE(elem, type_info.min_val)
159  << "Element " << elem << " less than min val " << type_info.min_val;
160 #endif
161  if (filling_func(elem, index)) {
162  return -1;
163  }
164  }
165  return 0;
166 };
#define CHECK_GE(x, y)
Definition: Logger.h:222
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const int64_t max_val
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 316 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

325  {
326  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
327  : SUFFIX(fill_one_to_one_hashtable);
328  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
329  auto entry_ptr = SUFFIX(get_hash_slot_sharded_opt)(buff,
330  elem,
331  type_info.min_val,
332  shard_info.entry_count_per_shard,
333  shard,
334  shard_info.num_shards,
335  shard_info.device_count);
336  return filling_func(index, entry_ptr, invalid_slot_val);
337  };
338 
 339  return fill_hash_join_buff_sharded_impl(buff,
 340  invalid_slot_val,
341  join_column,
342  type_info,
343  shard_info,
344  sd_inner_proxy,
345  sd_outer_proxy,
346  cpu_thread_idx,
347  cpu_thread_count,
348  hashtable_filling_func);
349 }
const size_t num_shards
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:124
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 277 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

288  {
289  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
290  : SUFFIX(fill_one_to_one_hashtable);
291  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
 292  auto entry_ptr =
 293  SUFFIX(get_bucketized_hash_slot_sharded_opt)(buff,
 294  elem,
295  type_info.min_val,
296  shard_info.entry_count_per_shard,
297  shard,
298  shard_info.num_shards,
299  shard_info.device_count,
300  bucket_normalization);
301  return filling_func(index, entry_ptr, invalid_slot_val);
302  };
303 
 304  return fill_hash_join_buff_sharded_impl(buff,
 305  invalid_slot_val,
306  join_column,
307  type_info,
308  shard_info,
309  sd_inner_proxy,
310  sd_outer_proxy,
311  cpu_thread_idx,
312  cpu_thread_count,
313  hashtable_filling_func);
314 }
const size_t num_shards
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:109
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 225 of file HashJoinRuntime.cpp.

References CHECK_GE, StringDictionary::INVALID_STR_ID, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

234  {
235 #ifdef __CUDACC__
236  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
237  int32_t step = blockDim.x * gridDim.x;
238 #else
239  int32_t start = cpu_thread_idx;
240  int32_t step = cpu_thread_count;
241 #endif
242  JoinColumnTyped col{&join_column, &type_info};
243  for (auto item : col.slice(start, step)) {
244  const size_t index = item.index;
245  int64_t elem = item.element;
246  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
247  if (shard != shard_info.shard) {
248  continue;
249  }
250  if (elem == type_info.null_val) {
251  if (type_info.uses_bw_eq) {
252  elem = type_info.translated_null_val;
253  } else {
254  continue;
255  }
256  }
257 #ifndef __CUDACC__
258  if (sd_inner_proxy &&
259  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
260  const auto outer_id = translate_str_id_to_outer_dict(
261  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
262  if (outer_id == StringDictionary::INVALID_STR_ID) {
263  continue;
264  }
265  elem = outer_id;
266  }
267  CHECK_GE(elem, type_info.min_val)
268  << "Element " << elem << " less than min val " << type_info.min_val;
269 #endif
270  if (filling_func(elem, shard, index)) {
271  return -1;
272  }
273  }
274  return 0;
275 }
const size_t num_shards
#define CHECK_GE(x, y)
Definition: Logger.h:222
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
const size_t shard
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const size_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 1867 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), i, inclusive_scan(), and SUFFIX.

1880  {
1881  int32_t* pos_buff = buff;
1882  int32_t* count_buff = buff + hash_entry_count;
1883  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1884  std::vector<std::future<void>> counter_threads;
1885  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1886  if (is_range_join) {
1887  counter_threads.push_back(std::async(
1888  std::launch::async,
1889  [count_buff,
1890  composite_key_dict,
1891  &hash_entry_count,
1892  &join_buckets_per_key,
1893  &join_column_per_key,
1894  &is_geo_compressed,
1895  cpu_thread_idx,
1896  cpu_thread_count] {
1897  const auto key_handler = RangeKeyHandler(
1898  is_geo_compressed,
1899  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1900  &join_column_per_key[0],
1901  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1902  count_matches_baseline(count_buff,
1903  composite_key_dict,
1904  hash_entry_count,
1905  &key_handler,
1906  join_column_per_key[0].num_elems,
1907  cpu_thread_idx,
1908  cpu_thread_count);
1909  }));
1910  } else if (join_buckets_per_key.size() > 0) {
1911  counter_threads.push_back(std::async(
1912  std::launch::async,
1913  [count_buff,
1914  composite_key_dict,
1915  &hash_entry_count,
1916  &join_buckets_per_key,
1917  &join_column_per_key,
1918  cpu_thread_idx,
1919  cpu_thread_count] {
1920  const auto key_handler = OverlapsKeyHandler(
1921  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1922  &join_column_per_key[0],
1923  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1924  count_matches_baseline(count_buff,
1925  composite_key_dict,
1926  hash_entry_count,
1927  &key_handler,
1928  join_column_per_key[0].num_elems,
1929  cpu_thread_idx,
1930  cpu_thread_count);
1931  }));
1932  } else {
1933  counter_threads.push_back(std::async(
1934  std::launch::async,
1935  [count_buff,
1936  composite_key_dict,
1937  &key_component_count,
1938  &hash_entry_count,
1939  &join_column_per_key,
1940  &type_info_per_key,
1941  &sd_inner_proxy_per_key,
1942  &sd_outer_proxy_per_key,
1943  cpu_thread_idx,
1944  cpu_thread_count] {
1945  const auto key_handler = GenericKeyHandler(key_component_count,
1946  true,
1947  &join_column_per_key[0],
1948  &type_info_per_key[0],
1949  &sd_inner_proxy_per_key[0],
1950  &sd_outer_proxy_per_key[0]);
1951  count_matches_baseline(count_buff,
1952  composite_key_dict,
1953  hash_entry_count,
1954  &key_handler,
1955  join_column_per_key[0].num_elems,
1956  cpu_thread_idx,
1957  cpu_thread_count);
1958  }));
1959  }
1960  }
1961 
1962  for (auto& child : counter_threads) {
1963  child.get();
1964  }
1965 
1966  std::vector<int32_t> count_copy(hash_entry_count, 0);
1967  CHECK_GT(hash_entry_count, int64_t(0));
1968  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1969  inclusive_scan(
1970  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1971  std::vector<std::future<void>> pos_threads;
1972  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1973  pos_threads.push_back(std::async(
1974  std::launch::async,
1975  [&](const int thread_idx) {
1976  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1977  if (count_buff[i]) {
1978  pos_buff[i] = count_copy[i];
1979  }
1980  }
1981  },
1982  cpu_thread_idx));
1983  }
1984  for (auto& child : pos_threads) {
1985  child.get();
1986  }
1987 
1988  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1989  std::vector<std::future<void>> rowid_threads;
1990  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1991  if (is_range_join) {
1992  rowid_threads.push_back(std::async(
1993  std::launch::async,
1994  [buff,
1995  composite_key_dict,
1996  hash_entry_count,
1997  invalid_slot_val,
1998  &join_column_per_key,
1999  &join_buckets_per_key,
2000  &is_geo_compressed,
2001  cpu_thread_idx,
2002  cpu_thread_count] {
2003  const auto key_handler = RangeKeyHandler(
2004  is_geo_compressed,
2005  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2006  &join_column_per_key[0],
2007  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2008  fill_row_ids_baseline
2009  (buff,
2010  composite_key_dict,
2011  hash_entry_count,
2012  invalid_slot_val,
2013  &key_handler,
2014  join_column_per_key[0].num_elems,
2015  cpu_thread_idx,
2016  cpu_thread_count);
2017  }));
2018  } else if (join_buckets_per_key.size() > 0) {
2019  rowid_threads.push_back(std::async(
2020  std::launch::async,
2021  [buff,
2022  composite_key_dict,
2023  hash_entry_count,
2024  invalid_slot_val,
2025  &join_column_per_key,
2026  &join_buckets_per_key,
2027  cpu_thread_idx,
2028  cpu_thread_count] {
2029  const auto key_handler = OverlapsKeyHandler(
2030  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2031  &join_column_per_key[0],
2032  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2033  fill_row_ids_baseline
2034  (buff,
2035  composite_key_dict,
2036  hash_entry_count,
2037  invalid_slot_val,
2038  &key_handler,
2039  join_column_per_key[0].num_elems,
2040  cpu_thread_idx,
2041  cpu_thread_count);
2042  }));
2043  } else {
2044  rowid_threads.push_back(std::async(std::launch::async,
2045  [buff,
2046  composite_key_dict,
2047  hash_entry_count,
2048  invalid_slot_val,
2049  key_component_count,
2050  &join_column_per_key,
2051  &type_info_per_key,
2052  &sd_inner_proxy_per_key,
2053  &sd_outer_proxy_per_key,
2054  cpu_thread_idx,
2055  cpu_thread_count] {
2056  const auto key_handler = GenericKeyHandler(
2057  key_component_count,
2058  true,
2059  &join_column_per_key[0],
2060  &type_info_per_key[0],
2061  &sd_inner_proxy_per_key[0],
2062  &sd_outer_proxy_per_key[0]);
2063  fill_row_ids_baseline
2064  (buff,
2065  composite_key_dict,
2066  hash_entry_count,
2067  invalid_slot_val,
2068  &key_handler,
2069  join_column_per_key[0].num_elems,
2070  cpu_thread_idx,
2071  cpu_thread_count);
2072  }));
2073  }
2074  }
2075 
2076  for (auto& child : rowid_threads) {
2077  child.get();
2078  }
2079 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:221
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 2081 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2094  {
2095  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2096  composite_key_dict,
2097  hash_entry_count,
2098  invalid_slot_val,
2099  key_component_count,
2100  join_column_per_key,
2101  type_info_per_key,
2102  join_bucket_info,
2103  sd_inner_proxy_per_key,
2104  sd_outer_proxy_per_key,
2105  cpu_thread_count,
2106  is_range_join,
2107  is_geo_compressed);
2108 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 2110 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2123  {
2124  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2125  composite_key_dict,
2126  hash_entry_count,
2127  invalid_slot_val,
2128  key_component_count,
2129  join_column_per_key,
2130  type_info_per_key,
2131  join_bucket_info,
2132  sd_inner_proxy_per_key,
2133  sd_outer_proxy_per_key,
2134  cpu_thread_count,
2135  is_range_join,
2136  is_geo_compressed);
2137 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1447 of file HashJoinRuntime.cpp.

References count_matches(), fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1454  {
1455  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1456  invalid_slot_val,
1457  &join_column,
1458  &type_info,
1459  sd_inner_proxy,
1460  sd_outer_proxy](auto cpu_thread_idx,
1461  auto cpu_thread_count) {
1463  (count_buff,
1464  invalid_slot_val,
1465  join_column,
1466  type_info,
1467  sd_inner_proxy,
1468  sd_outer_proxy,
1469  cpu_thread_idx,
1470  cpu_thread_count);
1471  };
1472  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1473  buff,
1474  invalid_slot_val,
1475  &join_column,
1476  &type_info,
1477  sd_inner_proxy,
1478  sd_outer_proxy](auto cpu_thread_idx,
1479  auto cpu_thread_count) {
1481  (buff,
1482  hash_entry_count,
1483  invalid_slot_val,
1484  join_column,
1485  type_info,
1486  sd_inner_proxy,
1487  sd_outer_proxy,
1488  cpu_thread_idx,
1489  cpu_thread_count);
1490  };
1491 
1493  hash_entry_info.hash_entry_count,
1494  invalid_slot_val,
1495  join_column,
1496  type_info,
1497  sd_inner_proxy,
1498  sd_outer_proxy,
1499  cpu_thread_count,
1500  launch_count_matches,
1501  launch_fill_row_ids);
1502 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t hash_entry_count
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1504 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1511  {
1512  auto bucket_normalization = hash_entry_info.bucket_normalization;
1513  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1514  auto launch_count_matches = [bucket_normalization,
1515  count_buff = buff + hash_entry_count,
1516  invalid_slot_val,
1517  &join_column,
1518  &type_info,
1519  sd_inner_proxy,
1520  sd_outer_proxy](auto cpu_thread_idx,
1521  auto cpu_thread_count) {
1523  (count_buff,
1524  invalid_slot_val,
1525  join_column,
1526  type_info,
1527  sd_inner_proxy,
1528  sd_outer_proxy,
1529  cpu_thread_idx,
1530  cpu_thread_count,
1531  bucket_normalization);
1532  };
1533  auto launch_fill_row_ids = [bucket_normalization,
1534  hash_entry_count,
1535  buff,
1536  invalid_slot_val,
1537  &join_column,
1538  &type_info,
1539  sd_inner_proxy,
1540  sd_outer_proxy](auto cpu_thread_idx,
1541  auto cpu_thread_count) {
1543  (buff,
1544  hash_entry_count,
1545  invalid_slot_val,
1546  join_column,
1547  type_info,
1548  sd_inner_proxy,
1549  sd_outer_proxy,
1550  cpu_thread_idx,
1551  cpu_thread_count,
1552  bucket_normalization);
1553  };
1554 
1556  hash_entry_count,
1557  invalid_slot_val,
1558  join_column,
1559  type_info,
1560  sd_inner_proxy,
1561  sd_outer_proxy,
1562  cpu_thread_count,
1563  launch_count_matches,
1564  launch_fill_row_ids);
1565 }
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1386 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1395  {
1396  int32_t* pos_buff = buff;
1397  int32_t* count_buff = buff + hash_entry_count;
1398  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1399  std::vector<std::future<void>> counter_threads;
1400  for (unsigned cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1401  counter_threads.push_back(std::async(
1402  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1403  }
1404 
1405  for (auto& child : counter_threads) {
1406  child.get();
1407  }
1408 
1409  std::vector<int32_t> count_copy(hash_entry_count, 0);
1410  CHECK_GT(hash_entry_count, int64_t(0));
1411  memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1412 #if HAVE_CUDA
1413  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1414 #else
1416  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1417 #endif
1418  std::vector<std::future<void>> pos_threads;
1419  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1420  pos_threads.push_back(std::async(
1422  [&](size_t thread_idx) {
1423  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1424  if (count_buff[i]) {
1425  pos_buff[i] = count_copy[i];
1426  }
1427  }
1428  },
1429  cpu_thread_idx));
1430  }
1431  for (auto& child : pos_threads) {
1432  child.get();
1433  }
1434 
1435  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1436  std::vector<std::future<void>> rowid_threads;
1437  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1438  rowid_threads.push_back(std::async(
1439  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1440  }
1441 
1442  for (auto& child : rowid_threads) {
1443  child.get();
1444  }
1445 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:221
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1627 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1635  {
1636  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1637  invalid_slot_val,
1638  &join_column,
1639  &type_info,
1640  &shard_info
1641 #ifndef __CUDACC__
1642  ,
1643  sd_inner_proxy,
1644  sd_outer_proxy
1645 #endif
1646  ](auto cpu_thread_idx, auto cpu_thread_count) {
1647  return SUFFIX(count_matches_sharded)(count_buff,
1648  invalid_slot_val,
1649  join_column,
1650  type_info,
1651  shard_info
1652 #ifndef __CUDACC__
1653  ,
1654  sd_inner_proxy,
1655  sd_outer_proxy,
1656  cpu_thread_idx,
1657  cpu_thread_count
1658 #endif
1659  );
1660  };
1661 
1662  auto launch_fill_row_ids = [buff,
1663  hash_entry_count,
1664  invalid_slot_val,
1665  &join_column,
1666  &type_info,
1667  &shard_info
1668 #ifndef __CUDACC__
1669  ,
1670  sd_inner_proxy,
1671  sd_outer_proxy
1672 #endif
1673  ](auto cpu_thread_idx, auto cpu_thread_count) {
1674  return SUFFIX(fill_row_ids_sharded)(buff,
1675  hash_entry_count,
1676  invalid_slot_val,
1677  join_column,
1678  type_info,
1679  shard_info
1680 #ifndef __CUDACC__
1681  ,
1682  sd_inner_proxy,
1683  sd_outer_proxy,
1684  cpu_thread_idx,
1685  cpu_thread_count);
1686 #endif
1687  };
1688 
1690  hash_entry_count,
1691  invalid_slot_val,
1692  join_column,
1693  type_info,
1694  shard_info
1695 #ifndef __CUDACC__
1696  ,
1697  sd_inner_proxy,
1698  sd_outer_proxy,
1699  cpu_thread_count
1700 #endif
1701  ,
1702  launch_count_matches,
1703  launch_fill_row_ids);
1704 }
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1568 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1579  {
1580  int32_t* pos_buff = buff;
1581  int32_t* count_buff = buff + hash_entry_count;
1582  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1583  std::vector<std::future<void>> counter_threads;
1584  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1585  counter_threads.push_back(std::async(
1586  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1587  }
1588 
1589  for (auto& child : counter_threads) {
1590  child.get();
1591  }
1592 
1593  std::vector<int32_t> count_copy(hash_entry_count, 0);
1594  CHECK_GT(hash_entry_count, int64_t(0));
1595  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1597  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1598  std::vector<std::future<void>> pos_threads;
1599  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1600  pos_threads.push_back(std::async(
1602  [&](const unsigned thread_idx) {
1603  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1604  if (count_buff[i]) {
1605  pos_buff[i] = count_copy[i];
1606  }
1607  }
1608  },
1609  cpu_thread_idx));
1610  }
1611  for (auto& child : pos_threads) {
1612  child.get();
1613  }
1614 
1615  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1616  std::vector<std::future<void>> rowid_threads;
1617  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1618  rowid_threads.push_back(std::async(
1619  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1620  }
1621 
1622  for (auto& child : rowid_threads) {
1623  child.get();
1624  }
1625 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:221
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 925 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

937  {
938  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
939  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
940  };
941 
942  fill_row_ids_impl(buff,
943  hash_entry_count,
944  invalid_slot_val,
945  join_column,
946  type_info
947 #ifndef __CUDACC__
948  ,
949  sd_inner_proxy,
950  sd_outer_proxy,
951  cpu_thread_idx,
952  cpu_thread_count
953 #endif
954  ,
955  slot_sel);
956 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:74
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1136 of file HashJoinRuntime.cpp.

References CHECK_NE, g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

1147  {
1148  int32_t* pos_buff = buff;
1149  int32_t* count_buff = buff + hash_entry_count;
1150  int32_t* id_buff = count_buff + hash_entry_count;
1151 #ifdef __CUDACC__
1152  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1153  int32_t step = blockDim.x * gridDim.x;
1154 #else
1155  int32_t start = cpu_thread_idx;
1156  int32_t step = cpu_thread_count;
1157 #endif
1158 
1159  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1160 #ifdef __CUDACC__
1161  assert(composite_key_dict);
1162 #endif
1163  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1164  auto key_buff_handler = [composite_key_dict,
1165  hash_entry_count,
1166  pos_buff,
1167  invalid_slot_val,
1168  count_buff,
1169  id_buff,
1170  key_size_in_bytes](const int64_t row_index,
1171  const T* key_scratch_buff,
1172  const size_t key_component_count) {
1173  const T* matching_group =
1175  key_component_count,
1176  composite_key_dict,
1177  hash_entry_count,
1178  key_size_in_bytes);
1179  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1180  int32_t* pos_ptr = pos_buff + entry_idx;
1181 #ifndef __CUDACC__
1182  CHECK_NE(*pos_ptr, invalid_slot_val);
1183 #endif
1184  const auto bin_idx = pos_ptr - pos_buff;
1185  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1186  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1187  return 0;
1188  };
1189 
1190  JoinColumnTuple cols(
1191  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1192  for (auto& it : cols.slice(start, step)) {
1193  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1194  }
1195  return;
1196 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define CHECK_NE(x, y)
Definition: Logger.h:218
char * f
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 958 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

971  {
972  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
974  pos_buff, elem, type_info.min_val, bucket_normalization);
975  };
976  fill_row_ids_impl(buff,
977  hash_entry_count,
978  invalid_slot_val,
979  join_column,
980  type_info
981 #ifndef __CUDACC__
982  ,
983  sd_inner_proxy,
984  sd_outer_proxy,
985  cpu_thread_idx,
986  cpu_thread_count
987 #endif
988  ,
989  slot_sel);
990 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
#define SUFFIX(name)
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 866 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, and anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict().

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

879  {
880  int32_t* pos_buff = buff;
881  int32_t* count_buff = buff + hash_entry_count;
882  int32_t* id_buff = count_buff + hash_entry_count;
883 
884 #ifdef __CUDACC__
885  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
886  int32_t step = blockDim.x * gridDim.x;
887 #else
888  int32_t start = cpu_thread_idx;
889  int32_t step = cpu_thread_count;
890 #endif
891  JoinColumnTyped col{&join_column, &type_info};
892  for (auto item : col.slice(start, step)) {
893  const size_t index = item.index;
894  int64_t elem = item.element;
895  if (elem == type_info.null_val) {
896  if (type_info.uses_bw_eq) {
897  elem = type_info.translated_null_val;
898  } else {
899  continue;
900  }
901  }
902 #ifndef __CUDACC__
903  if (sd_inner_proxy &&
904  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
905  const auto outer_id = translate_str_id_to_outer_dict(
906  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
907  if (outer_id == StringDictionary::INVALID_STR_ID) {
908  continue;
909  }
910  elem = outer_id;
911  }
912  CHECK_GE(elem, type_info.min_val)
913  << "Element " << elem << " less than min val " << type_info.min_val;
914 #endif
915  auto pos_ptr = slot_selector(pos_buff, elem);
916 #ifndef __CUDACC__
917  CHECK_NE(*pos_ptr, invalid_slot_val);
918 #endif
919  const auto bin_idx = pos_ptr - pos_buff;
920  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
921  id_buff[id_buff_idx] = static_cast<int32_t>(index);
922  }
923 }
#define CHECK_GE(x, y)
Definition: Logger.h:222
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:218
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1054 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

1067  {
1068  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
1069  return SUFFIX(get_hash_slot_sharded)(pos_buff,
1070  elem,
1071  type_info.min_val,
1072  shard_info.entry_count_per_shard,
1073  shard_info.num_shards,
1074  shard_info.device_count);
1075  };
1076 
1077  fill_row_ids_impl(buff,
1078  hash_entry_count,
1079  invalid_slot_val,
1080  join_column,
1081  type_info
1082 #ifndef __CUDACC__
1083  ,
1084  sd_inner_proxy,
1085  sd_outer_proxy,
1086  cpu_thread_idx,
1087  cpu_thread_count
1088 #endif
1089  ,
1090  slot_sel);
1091 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:95
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1093 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1107  {
1108  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1109  auto elem) {
1110  return SUFFIX(get_bucketized_hash_slot_sharded)(pos_buff,
1111  elem,
1112  type_info.min_val,
1113  shard_info.entry_count_per_shard,
1114  shard_info.num_shards,
1115  shard_info.device_count,
1116  bucket_normalization);
1117  };
1118 
1119  fill_row_ids_impl(buff,
1120  hash_entry_count,
1121  invalid_slot_val,
1122  join_column,
1123  type_info
1124 #ifndef __CUDACC__
1125  ,
1126  sd_inner_proxy,
1127  sd_outer_proxy,
1128  cpu_thread_idx,
1129  cpu_thread_count
1130 #endif
1131  ,
1132  slot_sel);
1133 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:80
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 993 of file HashJoinRuntime.cpp.

References CHECK_GE, CHECK_NE, StringDictionary::INVALID_STR_ID, mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict(), JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

1007  {
1008 
1009  int32_t* pos_buff = buff;
1010  int32_t* count_buff = buff + hash_entry_count;
1011  int32_t* id_buff = count_buff + hash_entry_count;
1012 
1013 #ifdef __CUDACC__
1014  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1015  int32_t step = blockDim.x * gridDim.x;
1016 #else
1017  int32_t start = cpu_thread_idx;
1018  int32_t step = cpu_thread_count;
1019 #endif
1020  JoinColumnTyped col{&join_column, &type_info};
1021  for (auto item : col.slice(start, step)) {
1022  const size_t index = item.index;
1023  int64_t elem = item.element;
1024  if (elem == type_info.null_val) {
1025  if (type_info.uses_bw_eq) {
1026  elem = type_info.translated_null_val;
1027  } else {
1028  continue;
1029  }
1030  }
1031 #ifndef __CUDACC__
1032  if (sd_inner_proxy &&
1033  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
1034  const auto outer_id = translate_str_id_to_outer_dict(
1035  elem, type_info.min_val, type_info.max_val, sd_inner_proxy, sd_outer_proxy);
1036  if (outer_id == StringDictionary::INVALID_STR_ID) {
1037  continue;
1038  }
1039  elem = outer_id;
1040  }
1041  CHECK_GE(elem, type_info.min_val)
1042  << "Element " << elem << " less than min val " << type_info.min_val;
1043 #endif
1044  auto* pos_ptr = slot_selector(pos_buff, elem);
1045 #ifndef __CUDACC__
1046  CHECK_NE(*pos_ptr, invalid_slot_val);
1047 #endif
1048  const auto bin_idx = pos_ptr - pos_buff;
1049  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1050  id_buff[id_buff_idx] = static_cast<int32_t>(index);
1051  }
1052 }
#define CHECK_GE(x, y)
Definition: Logger.h:222
int64_t translate_str_id_to_outer_dict(const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
#define CHECK_NE(x, y)
Definition: Logger.h:218
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 438 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), i, load_cst, store_cst, SUFFIX, omnisci.dtypes::T, and UNLIKELY.

Referenced by write_baseline_hash_slot(), and write_baseline_hash_slot_for_semi_join().

442  {
443  uint32_t off = h * hash_entry_size;
444  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
445  T empty_key = SUFFIX(get_invalid_key)<T>();
446  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
447  if (UNLIKELY(*key == write_pending)) {
448  // Address the singularity case where the first column contains the pending
449  // write special value. Should never happen, but avoid doing wrong things.
450  return nullptr;
451  }
452  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
453  if (success) {
454  if (key_component_count > 1) {
455  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
456  }
457  store_cst(row_ptr, *key);
458  return reinterpret_cast<T*>(row_ptr + key_component_count);
459  }
460  while (load_cst(row_ptr) == write_pending) {
461  // spin until the winning thread has finished writing the entire key
462  }
463  for (size_t i = 0; i < key_component_count; ++i) {
464  if (load_cst(row_ptr + i) != key[i]) {
465  return nullptr;
466  }
467  }
468  return reinterpret_cast<T*>(row_ptr + key_component_count);
469 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE NEVER_INLINE const T* SUFFIX() get_matching_baseline_hash_slot_readonly ( const T *  key,
const size_t  key_component_count,
const T *  composite_key_dict,
const int64_t  entry_count,
const size_t  key_size_in_bytes 
)

Definition at line 790 of file HashJoinRuntime.cpp.

References CHECK, keys_are_equal(), and MurmurHash1Impl().

Referenced by count_matches_baseline(), and fill_row_ids_baseline().

795  {
796  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
797  uint32_t off = h * key_component_count;
798  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
799  return &composite_key_dict[off];
800  }
801  uint32_t h_probe = (h + 1) % entry_count;
802  while (h_probe != h) {
803  off = h_probe * key_component_count;
804  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
805  return &composite_key_dict[off];
806  }
807  h_probe = (h_probe + 1) % entry_count;
808  }
809 #ifndef __CUDACC__
810  CHECK(false);
811 #else
812  assert(false);
813 #endif
814  return nullptr;
815 }
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename InputIterator , typename OutputIterator >
void inclusive_scan ( InputIterator  first,
InputIterator  last,
OutputIterator  out,
const size_t  thread_count 
)

Definition at line 1314 of file HashJoinRuntime.cpp.

References threading_serial::async(), and gpu_enabled::partial_sum().

Referenced by approximate_distinct_tuples_on_device_overlaps(), approximate_distinct_tuples_on_device_range(), approximate_distinct_tuples_overlaps(), approximate_distinct_tuples_range(), fill_one_to_many_baseline_hash_table(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_impl(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), fill_one_to_many_hash_table_sharded_impl(), and gpu_enabled::partial_sum().

1317  {
1318  using ElementType = typename InputIterator::value_type;
1319  using OffsetType = typename InputIterator::difference_type;
1320  const OffsetType elem_count = last - first;
1321  if (elem_count < 10000 || thread_count <= 1) {
1322  ElementType sum = 0;
1323  for (auto iter = first; iter != last; ++iter, ++out) {
1324  *out = sum += *iter;
1325  }
1326  return;
1327  }
1328 
1329  const OffsetType step = (elem_count + thread_count - 1) / thread_count;
1330  OffsetType start_off = 0;
1331  OffsetType end_off = std::min(step, elem_count);
1332  std::vector<ElementType> partial_sums(thread_count);
1333  std::vector<std::future<void>> counter_threads;
1334  for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,
1335  start_off = std::min(start_off + step, elem_count),
1336  end_off = std::min(start_off + step, elem_count)) {
1337  counter_threads.push_back(std::async(
1339  [first, out](
1340  ElementType& partial_sum, const OffsetType start, const OffsetType end) {
1341  ElementType sum = 0;
1342  for (auto in_iter = first + start, out_iter = out + start;
1343  in_iter != (first + end);
1344  ++in_iter, ++out_iter) {
1345  *out_iter = sum += *in_iter;
1346  }
1347  partial_sum = sum;
1348  },
1349  std::ref(partial_sums[thread_idx]),
1350  start_off,
1351  end_off));
1352  }
1353  for (auto& child : counter_threads) {
1354  child.get();
1355  }
1356 
1357  ElementType sum = 0;
1358  for (auto& s : partial_sums) {
1359  s += sum;
1360  sum = s;
1361  }
1362 
1363  counter_threads.clear();
1364  start_off = std::min(step, elem_count);
1365  end_off = std::min(start_off + step, elem_count);
1366  for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,
1367  start_off = std::min(start_off + step, elem_count),
1368  end_off = std::min(start_off + step, elem_count)) {
1369  counter_threads.push_back(std::async(
1371  [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {
1372  for (auto iter = out + start; iter != (out + end); ++iter) {
1373  *iter += prev_sum;
1374  }
1375  },
1376  partial_sums[thread_idx],
1377  start_off,
1378  end_off));
1379  }
1380  for (auto& child : counter_threads) {
1381  child.get();
1382  }
1383 }
future< Result > async(Fn &&fn, Args &&...args)
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 352 of file HashJoinRuntime.cpp.

References get_invalid_key(), i, SUFFIX, and omnisci.dtypes::T.

Referenced by init_baseline_hash_join_buff_wrapper().

358  {
359 #ifdef __CUDACC__
360  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
361  int32_t step = blockDim.x * gridDim.x;
362 #else
363  int32_t start = cpu_thread_idx;
364  int32_t step = cpu_thread_count;
365 #endif
366  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
367  const T empty_key = SUFFIX(get_invalid_key)<T>();
368  for (int64_t h = start; h < entry_count; h += step) {
369  int64_t off = h * hash_entry_size;
370  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
371  for (size_t i = 0; i < key_component_count; ++i) {
372  row_ptr[i] = empty_key;
373  }
374  if (with_val_slot) {
375  row_ptr[key_component_count] = invalid_slot_val;
376  }
377  }
378 }
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1706 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1712  {
1713  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1714  entry_count,
1715  key_component_count,
1716  with_val_slot,
1717  invalid_slot_val,
1718  cpu_thread_idx,
1719  cpu_thread_count);
1720 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1722 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1728  {
1729  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1730  entry_count,
1731  key_component_count,
1732  with_val_slot,
1733  invalid_slot_val,
1734  cpu_thread_idx,
1735  cpu_thread_count);
1736 }

+ Here is the caller graph for this function:

DEVICE void SUFFIX() init_hash_join_buff ( int32_t *  groups_buffer,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 92 of file HashJoinRuntime.cpp.

References i.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

96  {
97 #ifdef __CUDACC__
98  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
99  int32_t step = blockDim.x * gridDim.x;
100 #else
101  int32_t start = cpu_thread_idx;
102  int32_t step = cpu_thread_count;
103 #endif
104  for (int64_t i = start; i < hash_entry_count; i += step) {
105  groups_buffer[i] = invalid_slot_val;
106  }
107 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1760 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1768  {
1769  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1770  entry_count,
1771  invalid_slot_val,
1772  false,
1773  key_component_count,
1774  with_val_slot,
1775  key_handler,
1776  num_elems,
1777  cpu_thread_idx,
1778  cpu_thread_count);
1779 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1824 of file HashJoinRuntime.cpp.

1832  {
1833  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1834  entry_count,
1835  invalid_slot_val,
1836  false,
1837  key_component_count,
1838  with_val_slot,
1839  key_handler,
1840  num_elems,
1841  cpu_thread_idx,
1842  cpu_thread_count);
1843 }
int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1781 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1789  {
1790  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1791  entry_count,
1792  invalid_slot_val,
1793  false,
1794  key_component_count,
1795  with_val_slot,
1796  key_handler,
1797  num_elems,
1798  cpu_thread_idx,
1799  cpu_thread_count);
1800 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1845 of file HashJoinRuntime.cpp.

1853  {
1854  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1855  entry_count,
1856  invalid_slot_val,
1857  false,
1858  key_component_count,
1859  with_val_slot,
1860  key_handler,
1861  num_elems,
1862  cpu_thread_idx,
1863  cpu_thread_count);
1864 }
template<typename T >
DEVICE int write_baseline_hash_slot ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 478 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and omnisci.dtypes::T.

486  {
487  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
488  T* matching_group = get_matching_baseline_hash_slot_at(
489  hash_buff, h, key, key_component_count, hash_entry_size);
490  if (!matching_group) {
491  uint32_t h_probe = (h + 1) % entry_count;
492  while (h_probe != h) {
493  matching_group = get_matching_baseline_hash_slot_at(
494  hash_buff, h_probe, key, key_component_count, hash_entry_size);
495  if (matching_group) {
496  break;
497  }
498  h_probe = (h_probe + 1) % entry_count;
499  }
500  }
501  if (!matching_group) {
502  return -2;
503  }
504  if (!with_val_slot) {
505  return 0;
506  }
507  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
508  return -1;
509  }
510  return 0;
511 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function:

template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 514 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and omnisci.dtypes::T.

522  {
523  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
524  T* matching_group = get_matching_baseline_hash_slot_at(
525  hash_buff, h, key, key_component_count, hash_entry_size);
526  if (!matching_group) {
527  uint32_t h_probe = (h + 1) % entry_count;
528  while (h_probe != h) {
529  matching_group = get_matching_baseline_hash_slot_at(
530  hash_buff, h_probe, key, key_component_count, hash_entry_size);
531  if (matching_group) {
532  break;
533  }
534  h_probe = (h_probe + 1) % entry_count;
535  }
536  }
537  if (!matching_group) {
538  return -2;
539  }
540  if (!with_val_slot) {
541  return 0;
542  }
543  mapd_cas(matching_group, invalid_slot_val, val);
544  return 0;
545 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function: