OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntime.cpp File Reference
+ Include dependency graph for HashJoinRuntime.cpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict (const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl (int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T
*SUFFIX() 
get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *f, const int64_t num_elems, const bool for_window_framing, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, const bool for_window_framing)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const size_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 

Macro Definition Documentation

#define cas_cst (   ptr,
  expected,
  desired 
)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 465 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst (   ptr)    __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 469 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add (   address,
  val 
)    __sync_fetch_and_add(address, val)
#define mapd_cas (   address,
  compare,
  val 
)    __sync_val_compare_and_swap(address, compare, val)
#define store_cst (   ptr,
  val 
)    __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 468 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2196 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2201  {
2202  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2203  CHECK(!join_column_per_key.empty());
2204 
2205  std::vector<std::future<void>> approx_distinct_threads;
2206  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2207  approx_distinct_threads.push_back(std::async(
2209  [&join_column_per_key,
2210  &type_info_per_key,
2211  b,
2212  hll_buffer_all_cpus,
2213  padded_size_bytes,
2214  thread_idx,
2215  thread_count] {
2216  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2217 
2218  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2219  false,
2220  &join_column_per_key[0],
2221  &type_info_per_key[0],
2222  nullptr,
2223  nullptr);
2225  nullptr,
2226  b,
2227  join_column_per_key[0].num_elems,
2228  &key_handler,
2229  thread_idx,
2230  thread_count);
2231  }));
2232  }
2233  for (auto& child : approx_distinct_threads) {
2234  child.get();
2235  }
2236 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1243 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), approximate_distinct_tuples_overlaps(), and approximate_distinct_tuples_range().

1253  {
1254 #ifdef __CUDACC__
1255  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1256  int32_t step = blockDim.x * gridDim.x;
1257 #else
1258  int32_t start = cpu_thread_idx;
1259  int32_t step = cpu_thread_count;
1260 #endif
1261 
1262  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1263  const int64_t entry_idx,
1264  const int64_t* key_scratch_buff,
1265  const size_t key_component_count) {
1266  if (row_count_buffer) {
1267  row_count_buffer[entry_idx] += 1;
1268  }
1269 
1270  const uint64_t hash =
1271  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1272  const uint32_t index = hash >> (64 - b);
1273  const auto rank = get_rank(hash << b, 64 - b);
1274 #ifdef __CUDACC__
1275  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1276 #else
1277  hll_buffer[index] = std::max(hll_buffer[index], rank);
1278 #endif
1279 
1280  return 0;
1281  };
1282 
1283  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1284 
1285  JoinColumnTuple cols(
1286  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1287  for (auto& it : cols.slice(start, step)) {
1288  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1289  }
1290 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
constexpr double f
Definition: Utm.h:31
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2238 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2246  {
2247  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2248  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2249  CHECK(!join_column_per_key.empty());
2250 
2251  std::vector<std::future<void>> approx_distinct_threads;
2252  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2253  approx_distinct_threads.push_back(std::async(
2255  [&join_column_per_key,
2256  &join_buckets_per_key,
2257  &row_counts,
2258  b,
2259  hll_buffer_all_cpus,
2260  padded_size_bytes,
2261  thread_idx,
2262  thread_count] {
2263  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2264 
2265  const auto key_handler = OverlapsKeyHandler(
2266  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2267  &join_column_per_key[0],
2268  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2270  row_counts.data(),
2271  b,
2272  join_column_per_key[0].num_elems,
2273  &key_handler,
2274  thread_idx,
2275  thread_count);
2276  }));
2277  }
2278  for (auto& child : approx_distinct_threads) {
2279  child.get();
2280  }
2281 
2283  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2284 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2286 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2295  {
2296  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2297  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2298  CHECK(!join_column_per_key.empty());
2299 
2300  std::vector<std::future<void>> approx_distinct_threads;
2301  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2302  approx_distinct_threads.push_back(std::async(
2304  [&join_column_per_key,
2305  &join_buckets_per_key,
2306  &row_counts,
2307  b,
2308  hll_buffer_all_cpus,
2309  padded_size_bytes,
2310  thread_idx,
2311  is_compressed,
2312  thread_count] {
2313  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2314 
2315  const auto key_handler = RangeKeyHandler(
2316  is_compressed,
2317  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2318  &join_column_per_key[0],
2319  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2321  row_counts.data(),
2322  b,
2323  join_column_per_key[0].num_elems,
2324  &key_handler,
2325  thread_idx,
2326  thread_count);
2327  }));
2328  }
2329  for (auto& child : approx_distinct_threads) {
2330  child.get();
2331  }
2332 
2334  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2335 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1313 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), anonymous_namespace{Utm.h}::N, JoinColumnIterator::ptr(), and SUFFIX.

1322  {
1323 #ifdef __CUDACC__
1324  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1325  int32_t step = blockDim.x * gridDim.x;
1326 #else
1327  int32_t start = cpu_thread_idx;
1328  int32_t step = cpu_thread_count;
1329 #endif
1330  JoinColumnIterator it(join_column, type_info, start, step);
1331  for (; it; ++it) {
1332  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1333  double bounds[2 * N];
1334  for (size_t j = 0; j < 2 * N; j++) {
1335  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1336  }
1337 
1338  for (size_t j = 0; j < N; j++) {
1339  const auto diff = bounds[j + N] - bounds[j];
1340 #ifdef __CUDACC__
1341  if (diff > bucket_size_thresholds[j]) {
1342  atomicMin(&bucket_sizes_for_thread[j], diff);
1343  }
1344 #else
1345  if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {
1346  bucket_sizes_for_thread[j] = diff;
1347  }
1348 #endif
1349  }
1350  }
1351 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
constexpr unsigned N
Definition: Utm.h:110
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:134

+ Here is the call graph for this function:

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2337 of file HashJoinRuntime.cpp.

References threading_serial::async().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2341  {
2342  std::vector<std::vector<double>> bucket_sizes_for_threads;
2343  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2344  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2345  }
2346  std::vector<std::future<void>> threads;
2347  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2348  threads.push_back(std::async(std::launch::async,
2349  compute_bucket_sizes_impl<2>,
2350  bucket_sizes_for_threads[thread_idx].data(),
2351  &join_column,
2352  &type_info,
2353  bucket_size_thresholds.data(),
2354  thread_idx,
2355  thread_count));
2356  }
2357  for (auto& child : threads) {
2358  child.get();
2359  }
2360 
2361  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2362  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2363  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2364  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2365  }
2366  }
2367  }
2368 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 709 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

719  {
720  auto slot_sel = [&type_info](auto count_buff, auto elem) {
721  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
722  };
723  count_matches_impl(count_buff,
724  join_column,
725  type_info
726 #ifndef __CUDACC__
727  ,
728  sd_inner_to_outer_translation_map,
729  min_inner_elem,
730  cpu_thread_idx,
731  cpu_thread_count
732 #endif
733  ,
734  slot_sel);
735 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 854 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and heavydb.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

864  {
865 #ifdef __CUDACC__
866  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
867  int32_t step = blockDim.x * gridDim.x;
868 #else
869  int32_t start = cpu_thread_idx;
870  int32_t step = cpu_thread_count;
871 #endif
872 #ifdef __CUDACC__
873  assert(composite_key_dict);
874 #endif
875  T key_scratch_buff[g_maximum_conditions_to_coalesce];
876  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
877  auto key_buff_handler = [composite_key_dict,
878  entry_count,
879  count_buff,
880  key_size_in_bytes](const int64_t row_entry_idx,
881  const T* key_scratch_buff,
882  const size_t key_component_count) {
883  const auto matching_group =
885  key_component_count,
886  composite_key_dict,
887  entry_count,
888  key_size_in_bytes);
889  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
890  mapd_add(&count_buff[entry_idx], int32_t(1));
891  return 0;
892  };
893 
894  JoinColumnTuple cols(
895  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
896  for (auto& it : cols.slice(start, step)) {
897  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
898  }
899 }
#define SUFFIX(name)
constexpr double f
Definition: Utm.h:31
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 737 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

749  {
750  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
751  return SUFFIX(get_bucketized_hash_slot)(count_buff,
752  elem,
753  type_info.min_val / bucket_normalization,
754  type_info.translated_null_val,
755  bucket_normalization);
756  };
757  count_matches_impl(count_buff,
758  join_column,
759  type_info
760 #ifndef __CUDACC__
761  ,
762  sd_inner_to_outer_translation_map,
763  min_inner_elem,
764  cpu_thread_idx,
765  cpu_thread_count
766 #endif
767  ,
768  slot_sel);
769 }
#define SUFFIX(name)
const int64_t translated_null_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 661 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), and mapd_add.

Referenced by count_matches(), and count_matches_bucketized().

672  {
673 #ifdef __CUDACC__
674  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
675  int32_t step = blockDim.x * gridDim.x;
676 #else
677  int32_t start = cpu_thread_idx;
678  int32_t step = cpu_thread_count;
679 #endif
680  JoinColumnTyped col{&join_column, &type_info};
681  for (auto item : col.slice(start, step)) {
682  int64_t elem = item.element;
683  if (elem == type_info.null_val) {
684  if (type_info.uses_bw_eq) {
685  elem = type_info.translated_null_val;
686  } else {
687  continue;
688  }
689  }
690 #ifndef __CUDACC__
691  if (sd_inner_to_outer_translation_map &&
692  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
693  const auto outer_id = map_str_id_to_outer_dict(elem,
694  min_inner_elem,
695  type_info.min_val,
696  type_info.max_val,
697  sd_inner_to_outer_translation_map);
698  if (outer_id == StringDictionary::INVALID_STR_ID) {
699  continue;
700  }
701  elem = outer_id;
702  }
703 #endif
704  auto* entry_ptr = slot_selector(count_buff, elem);
705  mapd_add(entry_ptr, int32_t(1));
706  }
707 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 771 of file HashJoinRuntime.cpp.

References get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

783  {
784 #ifdef __CUDACC__
785  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
786  int32_t step = blockDim.x * gridDim.x;
787 #else
788  int32_t start = cpu_thread_idx;
789  int32_t step = cpu_thread_count;
790 #endif
791  JoinColumnTyped col{&join_column, &type_info};
792  for (auto item : col.slice(start, step)) {
793  int64_t elem = item.element;
794  if (elem == type_info.null_val) {
795  if (type_info.uses_bw_eq) {
796  elem = type_info.translated_null_val;
797  } else {
798  continue;
799  }
800  }
801 #ifndef __CUDACC__
802  if (sd_inner_to_outer_translation_map &&
803  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
804  const auto outer_id = map_str_id_to_outer_dict(elem,
805  min_inner_elem,
806  type_info.min_val,
807  type_info.max_val,
808  sd_inner_to_outer_translation_map);
809  if (outer_id == StringDictionary::INVALID_STR_ID) {
810  continue;
811  }
812  elem = outer_id;
813  }
814 #endif
815  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
816  elem,
817  type_info.min_val,
818  shard_info.entry_count_per_shard,
819  shard_info.num_shards,
820  shard_info.device_count);
821  mapd_add(entry_ptr, int32_t(1));
822  }
823 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const int64_t null_val
const size_t entry_count_per_shard
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:100
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 583 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, and heavydb.dtypes::T.

592  {
593 #ifdef __CUDACC__
594  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
595  int32_t step = blockDim.x * gridDim.x;
596 #else
597  int32_t start = cpu_thread_idx;
598  int32_t step = cpu_thread_count;
599 #endif
600 
601  T key_scratch_buff[g_maximum_conditions_to_coalesce];
602  const size_t key_size_in_bytes = key_component_count * sizeof(T);
603  const size_t hash_entry_size =
604  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
605  auto key_buff_handler = [hash_buff,
606  entry_count,
607  with_val_slot,
608  invalid_slot_val,
609  key_size_in_bytes,
610  hash_entry_size,
611  &for_semi_join](const int64_t entry_idx,
612  const T* key_scratch_buffer,
613  const size_t key_component_count) {
614  if (for_semi_join) {
615  return write_baseline_hash_slot_for_semi_join<T>(entry_idx,
616  hash_buff,
617  entry_count,
618  key_scratch_buffer,
619  key_component_count,
620  with_val_slot,
621  invalid_slot_val,
622  key_size_in_bytes,
623  hash_entry_size);
624  } else {
625  return write_baseline_hash_slot<T>(entry_idx,
626  hash_buff,
627  entry_count,
628  key_scratch_buffer,
629  key_component_count,
630  with_val_slot,
631  invalid_slot_val,
632  key_size_in_bytes,
633  hash_entry_size);
634  }
635  };
636 
637  JoinColumnTuple cols(
638  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
639  for (auto& it : cols.slice(start, step)) {
640  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
641  if (err) {
642  return err;
643  }
644  }
645  return 0;
646 }
constexpr double f
Definition: Utm.h:31
const size_t g_maximum_conditions_to_coalesce
int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1795 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1804  {
1805  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1806  entry_count,
1807  invalid_slot_val,
1808  for_semi_join,
1809  key_component_count,
1810  with_val_slot,
1811  key_handler,
1812  num_elems,
1813  cpu_thread_idx,
1814  cpu_thread_count);
1815 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1859 of file HashJoinRuntime.cpp.

1868  {
1869  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1870  entry_count,
1871  invalid_slot_val,
1872  for_semi_join,
1873  key_component_count,
1874  with_val_slot,
1875  key_handler,
1876  num_elems,
1877  cpu_thread_idx,
1878  cpu_thread_count);
1879 }
DEVICE int SUFFIX() fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 199 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

207  {
208  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
209  : SUFFIX(fill_one_to_one_hashtable);
210  auto hashtable_filling_func = [&](auto elem, size_t index) {
211  auto entry_ptr = SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
212  return filling_func(index, entry_ptr, invalid_slot_val);
213  };
214 
215  return fill_hash_join_buff_impl(buff,
216  join_column,
217  type_info,
218  sd_inner_to_outer_translation_map,
219  min_inner_elem,
220  cpu_thread_idx,
221  cpu_thread_count,
222  hashtable_filling_func);
223 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 166 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

176  {
177  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
178  : SUFFIX(fill_one_to_one_hashtable);
179  auto hashtable_filling_func = [&](auto elem, size_t index) {
180  auto entry_ptr =
182  elem,
183  type_info.min_val / bucket_normalization,
184  type_info.translated_null_val,
185  bucket_normalization);
186  return filling_func(index, entry_ptr, invalid_slot_val);
187  };
188 
189  return fill_hash_join_buff_impl(buff,
190  join_column,
191  type_info,
192  sd_inner_to_outer_translation_map,
193  min_inner_elem,
194  cpu_thread_idx,
195  cpu_thread_count,
196  hashtable_filling_func);
197 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
const int64_t translated_null_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl ( int32_t *  buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 119 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff(), and fill_hash_join_buff_bucketized().

126  {
127 #ifdef __CUDACC__
128  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
129  int32_t step = blockDim.x * gridDim.x;
130 #else
131  int32_t start = cpu_thread_idx;
132  int32_t step = cpu_thread_count;
133 #endif
134  JoinColumnTyped col{&join_column, &type_info};
135  for (auto item : col.slice(start, step)) {
136  const size_t index = item.index;
137  int64_t elem = item.element;
138  if (elem == type_info.null_val) {
139  if (type_info.uses_bw_eq) {
140  elem = type_info.translated_null_val;
141  } else {
142  continue;
143  }
144  }
145 #ifndef __CUDACC__
146  if (sd_inner_to_outer_translation_map &&
147  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
148  const auto outer_id = map_str_id_to_outer_dict(elem,
149  min_inner_elem,
150  type_info.min_val,
151  type_info.max_val,
152  sd_inner_to_outer_translation_map);
153  if (outer_id == StringDictionary::INVALID_STR_ID) {
154  continue;
155  }
156  elem = outer_id;
157  }
158 #endif
159  if (filling_func(elem, index)) {
160  return -1;
161  }
162  }
163  return 0;
164 };
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 318 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

328  {
329  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
330  : SUFFIX(fill_one_to_one_hashtable);
331  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
332  auto entry_ptr = SUFFIX(get_hash_slot_sharded_opt)(buff,
333  elem,
334  type_info.min_val,
335  shard_info.entry_count_per_shard,
336  shard,
337  shard_info.num_shards,
338  shard_info.device_count);
339  return filling_func(index, entry_ptr, invalid_slot_val);
340  };
341 
343  join_column,
344  type_info,
345  shard_info,
346  sd_inner_to_outer_translation_map,
347  min_inner_elem,
348  cpu_thread_idx,
349  cpu_thread_count,
350  hashtable_filling_func);
351 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:132
const int64_t min_val
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 279 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

290  {
291  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
292  : SUFFIX(fill_one_to_one_hashtable);
293  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
295  buff,
296  elem,
297  type_info.min_val / bucket_normalization,
298  type_info.translated_null_val,
299  shard_info.entry_count_per_shard,
300  shard,
301  shard_info.num_shards,
302  shard_info.device_count,
303  bucket_normalization);
304  return filling_func(index, entry_ptr, invalid_slot_val);
305  };
306 
308  join_column,
309  type_info,
310  shard_info,
311  sd_inner_to_outer_translation_map,
312  min_inner_elem,
313  cpu_thread_idx,
314  cpu_thread_count,
315  hashtable_filling_func);
316 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const size_t entry_count_per_shard
const int64_t translated_null_val
const int64_t min_val
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:114

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 226 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

235  {
236 #ifdef __CUDACC__
237  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
238  int32_t step = blockDim.x * gridDim.x;
239 #else
240  int32_t start = cpu_thread_idx;
241  int32_t step = cpu_thread_count;
242 #endif
243  JoinColumnTyped col{&join_column, &type_info};
244  for (auto item : col.slice(start, step)) {
245  const size_t index = item.index;
246  int64_t elem = item.element;
247  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
248  if (shard != shard_info.shard) {
249  continue;
250  }
251  if (elem == type_info.null_val) {
252  if (type_info.uses_bw_eq) {
253  elem = type_info.translated_null_val;
254  } else {
255  continue;
256  }
257  }
258 #ifndef __CUDACC__
259  if (sd_inner_to_outer_translation_map &&
260  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
261  const auto outer_id = map_str_id_to_outer_dict(elem,
262  min_inner_elem,
263  type_info.min_val,
264  type_info.max_val,
265  sd_inner_to_outer_translation_map);
266  if (outer_id == StringDictionary::INVALID_STR_ID) {
267  continue;
268  }
269  elem = outer_id;
270  }
271 #endif
272  if (filling_func(elem, shard, index)) {
273  return -1;
274  }
275  }
276  return 0;
277 }
const size_t num_shards
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const size_t shard
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const size_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed,
const bool  for_window_framing 
)

Definition at line 1924 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), inclusive_scan(), and SUFFIX.

1937  {
1938  int32_t* pos_buff = buff;
1939  int32_t* count_buff = buff + hash_entry_count;
1940  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1941  std::vector<std::future<void>> counter_threads;
1942  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1943  if (is_range_join) {
1944  counter_threads.push_back(std::async(
1946  [count_buff,
1947  composite_key_dict,
1948  &hash_entry_count,
1949  &join_buckets_per_key,
1950  &join_column_per_key,
1951  &is_geo_compressed,
1952  cpu_thread_idx,
1953  cpu_thread_count] {
1954  const auto key_handler = RangeKeyHandler(
1955  is_geo_compressed,
1956  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1957  &join_column_per_key[0],
1958  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1959  count_matches_baseline(count_buff,
1960  composite_key_dict,
1961  hash_entry_count,
1962  &key_handler,
1963  join_column_per_key[0].num_elems,
1964  cpu_thread_idx,
1965  cpu_thread_count);
1966  }));
1967  } else if (join_buckets_per_key.size() > 0) {
1968  counter_threads.push_back(std::async(
1970  [count_buff,
1971  composite_key_dict,
1972  &hash_entry_count,
1973  &join_buckets_per_key,
1974  &join_column_per_key,
1975  cpu_thread_idx,
1976  cpu_thread_count] {
1977  const auto key_handler = OverlapsKeyHandler(
1978  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1979  &join_column_per_key[0],
1980  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1981  count_matches_baseline(count_buff,
1982  composite_key_dict,
1983  hash_entry_count,
1984  &key_handler,
1985  join_column_per_key[0].num_elems,
1986  cpu_thread_idx,
1987  cpu_thread_count);
1988  }));
1989  } else {
1990  counter_threads.push_back(
1992  [count_buff,
1993  composite_key_dict,
1994  &key_component_count,
1995  &hash_entry_count,
1996  &join_column_per_key,
1997  &type_info_per_key,
1998  &sd_inner_to_outer_translation_maps,
1999  &sd_min_inner_elems,
2000  cpu_thread_idx,
2001  cpu_thread_count] {
2002  const auto key_handler =
2003  GenericKeyHandler(key_component_count,
2004  true,
2005  &join_column_per_key[0],
2006  &type_info_per_key[0],
2007  &sd_inner_to_outer_translation_maps[0],
2008  &sd_min_inner_elems[0]);
2009  count_matches_baseline(count_buff,
2010  composite_key_dict,
2011  hash_entry_count,
2012  &key_handler,
2013  join_column_per_key[0].num_elems,
2014  cpu_thread_idx,
2015  cpu_thread_count);
2016  }));
2017  }
2018  }
2019 
2020  for (auto& child : counter_threads) {
2021  child.get();
2022  }
2023 
2024  std::vector<int32_t> count_copy(hash_entry_count, 0);
2025  CHECK_GT(hash_entry_count, int64_t(0));
2026  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
2028  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
2029  std::vector<std::future<void>> pos_threads;
2030  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
2031  pos_threads.push_back(std::async(
2033  [&](const int thread_idx) {
2034  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
2035  if (count_buff[i]) {
2036  pos_buff[i] = count_copy[i];
2037  }
2038  }
2039  },
2040  cpu_thread_idx));
2041  }
2042  for (auto& child : pos_threads) {
2043  child.get();
2044  }
2045 
2046  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
2047  std::vector<std::future<void>> rowid_threads;
2048  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
2049  if (is_range_join) {
2050  rowid_threads.push_back(std::async(
2052  [buff,
2053  composite_key_dict,
2054  hash_entry_count,
2055  &join_column_per_key,
2056  &join_buckets_per_key,
2057  &is_geo_compressed,
2058  cpu_thread_idx,
2059  cpu_thread_count] {
2060  const auto key_handler = RangeKeyHandler(
2061  is_geo_compressed,
2062  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2063  &join_column_per_key[0],
2064  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2066  (buff,
2067  composite_key_dict,
2068  hash_entry_count,
2069  &key_handler,
2070  join_column_per_key[0].num_elems,
2071  false,
2072  cpu_thread_idx,
2073  cpu_thread_count);
2074  }));
2075  } else if (join_buckets_per_key.size() > 0) {
2076  rowid_threads.push_back(std::async(
2078  [buff,
2079  composite_key_dict,
2080  hash_entry_count,
2081  &join_column_per_key,
2082  &join_buckets_per_key,
2083  for_window_framing,
2084  cpu_thread_idx,
2085  cpu_thread_count] {
2086  const auto key_handler = OverlapsKeyHandler(
2087  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2088  &join_column_per_key[0],
2089  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2091  (buff,
2092  composite_key_dict,
2093  hash_entry_count,
2094  &key_handler,
2095  join_column_per_key[0].num_elems,
2096  for_window_framing,
2097  cpu_thread_idx,
2098  cpu_thread_count);
2099  }));
2100  } else {
2101  rowid_threads.push_back(std::async(std::launch::async,
2102  [buff,
2103  composite_key_dict,
2104  hash_entry_count,
2105  key_component_count,
2106  &join_column_per_key,
2107  &type_info_per_key,
2108  &sd_inner_to_outer_translation_maps,
2109  &sd_min_inner_elems,
2110  for_window_framing,
2111  cpu_thread_idx,
2112  cpu_thread_count] {
2113  const auto key_handler = GenericKeyHandler(
2114  key_component_count,
2115  true,
2116  &join_column_per_key[0],
2117  &type_info_per_key[0],
2118  &sd_inner_to_outer_translation_maps[0],
2119  &sd_min_inner_elems[0]);
2121  (buff,
2122  composite_key_dict,
2123  hash_entry_count,
2124  &key_handler,
2125  join_column_per_key[0].num_elems,
2126  for_window_framing,
2127  cpu_thread_idx,
2128  cpu_thread_count);
2129  }));
2130  }
2131  }
2132 
2133  for (auto& child : rowid_threads) {
2134  child.get();
2135  }
2136 }
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:305
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *f, const int64_t num_elems, const bool for_window_framing, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed,
const bool  for_window_framing 
)

Definition at line 2138 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2151  {
2152  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2153  composite_key_dict,
2154  hash_entry_count,
2155  key_component_count,
2156  join_column_per_key,
2157  type_info_per_key,
2158  join_bucket_info,
2159  sd_inner_to_outer_translation_maps,
2160  sd_min_inner_elems,
2161  cpu_thread_count,
2162  is_range_join,
2163  is_geo_compressed,
2164  for_window_framing);
2165 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed,
const bool  for_window_framing 
)

Definition at line 2167 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2180  {
2181  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2182  composite_key_dict,
2183  hash_entry_count,
2184  key_component_count,
2185  join_column_per_key,
2186  type_info_per_key,
2187  join_bucket_info,
2188  sd_inner_to_outer_translation_maps,
2189  sd_min_inner_elems,
2190  cpu_thread_count,
2191  is_range_join,
2192  is_geo_compressed,
2193  for_window_framing);
2194 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
const bool  for_window_framing 
)

Definition at line 1490 of file HashJoinRuntime.cpp.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1497  {
1498  auto timer = DEBUG_TIMER(__func__);
1499  auto launch_count_matches =
1500  [count_buff = buff + hash_entry_info.bucketized_hash_entry_count,
1501  &join_column,
1502  &type_info,
1503  sd_inner_to_outer_translation_map,
1504  min_inner_elem](auto cpu_thread_idx, auto cpu_thread_count) {
1506  (count_buff,
1507  join_column,
1508  type_info,
1509  sd_inner_to_outer_translation_map,
1510  min_inner_elem,
1511  cpu_thread_idx,
1512  cpu_thread_count);
1513  };
1514  auto launch_fill_row_ids =
1515  [hash_entry_count = hash_entry_info.bucketized_hash_entry_count,
1516  buff,
1517  &join_column,
1518  &type_info,
1519  sd_inner_to_outer_translation_map,
1520  min_inner_elem,
1521  for_window_framing](auto cpu_thread_idx, auto cpu_thread_count) {
1523  (buff,
1524  hash_entry_count,
1525  join_column,
1526  type_info,
1527  for_window_framing,
1528  sd_inner_to_outer_translation_map,
1529  min_inner_elem,
1530  cpu_thread_idx,
1531  cpu_thread_count);
1532  };
1533 
1535  hash_entry_info.bucketized_hash_entry_count,
1536  join_column,
1537  type_info,
1538  sd_inner_to_outer_translation_map,
1539  min_inner_elem,
1540  cpu_thread_count,
1541  for_window_framing,
1542  launch_count_matches,
1543  launch_fill_row_ids);
1544 }
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define DEBUG_TIMER(name)
Definition: Logger.h:411
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t bucketized_hash_entry_count

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1546 of file HashJoinRuntime.cpp.

References BucketizedHashEntryInfo::bucket_normalization, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), BucketizedHashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1553  {
1554  auto timer = DEBUG_TIMER(__func__);
1555  auto bucket_normalization = hash_entry_info.bucket_normalization;
1556  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1557  auto launch_count_matches = [bucket_normalization,
1558  count_buff = buff + hash_entry_count,
1559  &join_column,
1560  &type_info,
1561  sd_inner_to_outer_translation_map,
1562  min_inner_elem](auto cpu_thread_idx,
1563  auto cpu_thread_count) {
1565  (count_buff,
1566  join_column,
1567  type_info,
1568  sd_inner_to_outer_translation_map,
1569  min_inner_elem,
1570  cpu_thread_idx,
1571  cpu_thread_count,
1572  bucket_normalization);
1573  };
1574  auto launch_fill_row_ids = [bucket_normalization,
1575  hash_entry_count,
1576  buff,
1577  &join_column,
1578  &type_info,
1579  sd_inner_to_outer_translation_map,
1580  min_inner_elem](auto cpu_thread_idx,
1581  auto cpu_thread_count) {
1583  (buff,
1584  hash_entry_count,
1585  join_column,
1586  type_info,
1587  sd_inner_to_outer_translation_map,
1588  min_inner_elem,
1589  cpu_thread_idx,
1590  cpu_thread_count,
1591  bucket_normalization);
1592  };
1593 
1595  hash_entry_count,
1596  join_column,
1597  type_info,
1598  sd_inner_to_outer_translation_map,
1599  min_inner_elem,
1600  cpu_thread_count,
1601  false,
1602  launch_count_matches,
1603  launch_fill_row_ids);
1604 }
#define SUFFIX(name)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
#define DEBUG_TIMER(name)
Definition: Logger.h:411
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
const bool  for_window_framing,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1428 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1437  {
1438  auto timer = DEBUG_TIMER(__func__);
1439  int32_t* pos_buff = buff;
1440  int32_t* count_buff = buff + hash_entry_count;
1441  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1442  std::vector<std::future<void>> counter_threads;
1443  for (unsigned cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1444  counter_threads.push_back(std::async(
1445  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1446  }
1447 
1448  for (auto& child : counter_threads) {
1449  child.get();
1450  }
1451 
1452  std::vector<int32_t> count_copy(hash_entry_count, 0);
1453  CHECK_GT(hash_entry_count, int64_t(0));
1454  memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1455 #if HAVE_CUDA
1456  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1457 #else
1459  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1460 #endif
1461  std::vector<std::future<void>> pos_threads;
1462  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1463  pos_threads.push_back(std::async(
1465  [&](size_t thread_idx) {
1466  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1467  if (count_buff[i]) {
1468  pos_buff[i] = count_copy[i];
1469  }
1470  }
1471  },
1472  cpu_thread_idx));
1473  }
1474  for (auto& child : pos_threads) {
1475  child.get();
1476  }
1477 
1478  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1479  std::vector<std::future<void>> rowid_threads;
1480  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1481  rowid_threads.push_back(std::async(
1482  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1483  }
1484 
1485  for (auto& child : rowid_threads) {
1486  child.get();
1487  }
1488 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:305
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:411

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1666 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1673  {
1674  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1675  &join_column,
1676  &type_info,
1677  &shard_info
1678 #ifndef __CUDACC__
1679  ,
1680  sd_inner_to_outer_translation_map,
1681  min_inner_elem
1682 #endif
1683  ](auto cpu_thread_idx, auto cpu_thread_count) {
1684  return SUFFIX(count_matches_sharded)(count_buff,
1685  join_column,
1686  type_info,
1687  shard_info
1688 #ifndef __CUDACC__
1689  ,
1690  sd_inner_to_outer_translation_map,
1691  min_inner_elem,
1692  cpu_thread_idx,
1693  cpu_thread_count
1694 #endif
1695  );
1696  };
1697 
1698  auto launch_fill_row_ids = [buff,
1699  hash_entry_count,
1700  &join_column,
1701  &type_info,
1702  &shard_info
1703 #ifndef __CUDACC__
1704  ,
1705  sd_inner_to_outer_translation_map,
1706  min_inner_elem
1707 #endif
1708  ](auto cpu_thread_idx, auto cpu_thread_count) {
1709  return SUFFIX(fill_row_ids_sharded)(buff,
1710  hash_entry_count,
1711  join_column,
1712  type_info,
1713  shard_info
1714 #ifndef __CUDACC__
1715  ,
1716  sd_inner_to_outer_translation_map,
1717  min_inner_elem,
1718  cpu_thread_idx,
1719  cpu_thread_count);
1720 #endif
1721  };
1722 
1724  hash_entry_count,
1725  join_column,
1726  type_info,
1727  shard_info
1728 #ifndef __CUDACC__
1729  ,
1730  sd_inner_to_outer_translation_map,
1731  min_inner_elem,
1732  cpu_thread_count
1733 #endif
1734  ,
1735  launch_count_matches,
1736  launch_fill_row_ids);
1737 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1607 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1617  {
1618  auto timer = DEBUG_TIMER(__func__);
1619  int32_t* pos_buff = buff;
1620  int32_t* count_buff = buff + hash_entry_count;
1621  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1622  std::vector<std::future<void>> counter_threads;
1623  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1624  counter_threads.push_back(std::async(
1625  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1626  }
1627 
1628  for (auto& child : counter_threads) {
1629  child.get();
1630  }
1631 
1632  std::vector<int32_t> count_copy(hash_entry_count, 0);
1633  CHECK_GT(hash_entry_count, int64_t(0));
1634  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1636  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1637  std::vector<std::future<void>> pos_threads;
1638  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1639  pos_threads.push_back(std::async(
1641  [&](const unsigned thread_idx) {
1642  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1643  if (count_buff[i]) {
1644  pos_buff[i] = count_copy[i];
1645  }
1646  }
1647  },
1648  cpu_thread_idx));
1649  }
1650  for (auto& child : pos_threads) {
1651  child.get();
1652  }
1653 
1654  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1655  std::vector<std::future<void>> rowid_threads;
1656  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1657  rowid_threads.push_back(std::async(
1658  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1659  }
1660 
1661  for (auto& child : rowid_threads) {
1662  child.get();
1663  }
1664 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:305
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:411

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const bool  for_window_framing,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 964 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

976  {
977  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
978  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
979  };
980 
981  fill_row_ids_impl(buff,
982  hash_entry_count,
983  join_column,
984  type_info,
985  for_window_framing
986 #ifndef __CUDACC__
987  ,
988  sd_inner_to_outer_translation_map,
989  min_inner_elem,
990  cpu_thread_idx,
991  cpu_thread_count
992 #endif
993  ,
994  slot_sel);
995 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const bool  for_window_framing,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1176 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and heavydb.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

1187  {
1188  int32_t* pos_buff = buff;
1189  int32_t* count_buff = buff + hash_entry_count;
1190  int32_t* id_buff = count_buff + hash_entry_count;
1191  int32_t* reversed_id_buff = for_window_framing ? id_buff + num_elems : nullptr;
1192 #ifdef __CUDACC__
1193  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1194  int32_t step = blockDim.x * gridDim.x;
1195 #else
1196  int32_t start = cpu_thread_idx;
1197  int32_t step = cpu_thread_count;
1198 #endif
1199 
1200  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1201 #ifdef __CUDACC__
1202  assert(composite_key_dict);
1203 #endif
1204  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1205  auto key_buff_handler = [composite_key_dict,
1206  hash_entry_count,
1207  pos_buff,
1208  count_buff,
1209  id_buff,
1210  reversed_id_buff,
1211  key_size_in_bytes,
1212  for_window_framing](const int64_t row_index,
1213  const T* key_scratch_buff,
1214  const size_t key_component_count) {
1215  const T* matching_group =
1217  key_component_count,
1218  composite_key_dict,
1219  hash_entry_count,
1220  key_size_in_bytes);
1221  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1222  int32_t* pos_ptr = pos_buff + entry_idx;
1223  const auto bin_idx = pos_ptr - pos_buff;
1224  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1225  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1226  if (for_window_framing) {
1227  reversed_id_buff[row_index] = id_buff_idx;
1228  }
1229  return 0;
1230  };
1231 
1232  JoinColumnTuple cols(
1233  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1234  for (auto& it : cols.slice(start, step)) {
1235  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1236  }
1237  return;
1238 }
#define SUFFIX(name)
constexpr double f
Definition: Utm.h:31
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 997 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

1010  {
1011  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
1012  return SUFFIX(get_bucketized_hash_slot)(pos_buff,
1013  elem,
1014  type_info.min_val / bucket_normalization,
1015  type_info.translated_null_val,
1016  bucket_normalization);
1017  };
1018  fill_row_ids_impl(buff,
1019  hash_entry_count,
1020  join_column,
1021  type_info,
1022  false
1023 #ifndef __CUDACC__
1024  ,
1025  sd_inner_to_outer_translation_map,
1026  min_inner_elem,
1027  cpu_thread_idx,
1028  cpu_thread_count
1029 #endif
1030  ,
1031  slot_sel);
1032 }
#define SUFFIX(name)
const int64_t translated_null_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
const int64_t min_val
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const bool  for_window_framing,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 902 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, JoinColumn::num_elems, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

915  {
916  int32_t* pos_buff = buff;
917  int32_t* count_buff = buff + hash_entry_count;
918  int32_t* id_buff = count_buff + hash_entry_count;
919  int32_t* reversed_id_buff =
920  for_window_framing ? id_buff + join_column.num_elems : nullptr;
921 
922 #ifdef __CUDACC__
923  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
924  int32_t step = blockDim.x * gridDim.x;
925 #else
926  int32_t start = cpu_thread_idx;
927  int32_t step = cpu_thread_count;
928 #endif
929  JoinColumnTyped col{&join_column, &type_info};
930  for (auto item : col.slice(start, step)) {
931  const size_t index = item.index;
932  int64_t elem = item.element;
933  if (elem == type_info.null_val) {
934  if (type_info.uses_bw_eq) {
935  elem = type_info.translated_null_val;
936  } else {
937  continue;
938  }
939  }
940 #ifndef __CUDACC__
941  if (sd_inner_to_outer_translation_map &&
942  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
943  const auto outer_id = map_str_id_to_outer_dict(elem,
944  min_inner_elem,
945  type_info.min_val,
946  type_info.max_val,
947  sd_inner_to_outer_translation_map);
948  if (outer_id == StringDictionary::INVALID_STR_ID) {
949  continue;
950  }
951  elem = outer_id;
952  }
953 #endif
954  auto pos_ptr = slot_selector(pos_buff, elem);
955  const auto bin_idx = pos_ptr - pos_buff;
956  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
957  id_buff[id_buff_idx] = static_cast<int32_t>(index);
958  if (for_window_framing) {
959  reversed_id_buff[index] = id_buff_idx;
960  }
961  }
962 }
size_t num_elems
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1093 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

1105  {
1106  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
1107  return SUFFIX(get_hash_slot_sharded)(pos_buff,
1108  elem,
1109  type_info.min_val,
1110  shard_info.entry_count_per_shard,
1111  shard_info.num_shards,
1112  shard_info.device_count);
1113  };
1114 
1115  fill_row_ids_impl(buff,
1116  hash_entry_count,
1117  join_column,
1118  type_info,
1119  false
1120 #ifndef __CUDACC__
1121  ,
1122  sd_inner_to_outer_translation_map,
1123  min_inner_elem,
1124  cpu_thread_idx,
1125  cpu_thread_count
1126 #endif
1127  ,
1128  slot_sel);
1129 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:100
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1131 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1145  {
1146  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1147  auto elem) {
1149  pos_buff,
1150  elem,
1151  type_info.min_val / bucket_normalization,
1152  type_info.translated_null_val,
1153  shard_info.entry_count_per_shard,
1154  shard_info.num_shards,
1155  shard_info.device_count,
1156  bucket_normalization);
1157  };
1158 
1159  fill_row_ids_impl(buff,
1160  hash_entry_count,
1161  join_column,
1162  type_info,
1163  false
1164 #ifndef __CUDACC__
1165  ,
1166  sd_inner_to_outer_translation_map,
1167  min_inner_elem,
1168  cpu_thread_idx,
1169  cpu_thread_count
1170 #endif
1171  ,
1172  slot_sel);
1173 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:82

+ Here is the call graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 1035 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

1048  {
1049 
1050  int32_t* pos_buff = buff;
1051  int32_t* count_buff = buff + hash_entry_count;
1052  int32_t* id_buff = count_buff + hash_entry_count;
1053 
1054 #ifdef __CUDACC__
1055  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1056  int32_t step = blockDim.x * gridDim.x;
1057 #else
1058  int32_t start = cpu_thread_idx;
1059  int32_t step = cpu_thread_count;
1060 #endif
1061  JoinColumnTyped col{&join_column, &type_info};
1062  for (auto item : col.slice(start, step)) {
1063  const size_t index = item.index;
1064  int64_t elem = item.element;
1065  if (elem == type_info.null_val) {
1066  if (type_info.uses_bw_eq) {
1067  elem = type_info.translated_null_val;
1068  } else {
1069  continue;
1070  }
1071  }
1072 #ifndef __CUDACC__
1073  if (sd_inner_to_outer_translation_map &&
1074  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
1075  const auto outer_id = map_str_id_to_outer_dict(elem,
1076  min_inner_elem,
1077  type_info.min_val,
1078  type_info.max_val,
1079  sd_inner_to_outer_translation_map);
1080  if (outer_id == StringDictionary::INVALID_STR_ID) {
1081  continue;
1082  }
1083  elem = outer_id;
1084  }
1085 #endif
1086  auto* pos_ptr = slot_selector(pos_buff, elem);
1087  const auto bin_idx = pos_ptr - pos_buff;
1088  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1089  id_buff[id_buff_idx] = static_cast<int32_t>(index);
1090  }
1091 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 473 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), load_cst, store_cst, SUFFIX, heavydb.dtypes::T, and UNLIKELY.

Referenced by write_baseline_hash_slot(), and write_baseline_hash_slot_for_semi_join().

477  {
478  uint32_t off = h * hash_entry_size;
479  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
480  T empty_key = SUFFIX(get_invalid_key)<T>();
481  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
482  if (UNLIKELY(*key == write_pending)) {
483  // Address the singularity case where the first column contains the pending
484  // write special value. Should never happen, but avoid doing wrong things.
485  return nullptr;
486  }
487  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
488  if (success) {
489  if (key_component_count > 1) {
490  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
491  }
492  store_cst(row_ptr, *key);
493  return reinterpret_cast<T*>(row_ptr + key_component_count);
494  }
495  while (load_cst(row_ptr) == write_pending) {
496  // spin until the winning thread has finished writing the entire key
497  }
498  for (size_t i = 0; i < key_component_count; ++i) {
499  if (load_cst(row_ptr + i) != key[i]) {
500  return nullptr;
501  }
502  }
503  return reinterpret_cast<T*>(row_ptr + key_component_count);
504 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE NEVER_INLINE const T* SUFFIX() get_matching_baseline_hash_slot_readonly ( const T *  key,
const size_t  key_component_count,
const T *  composite_key_dict,
const int64_t  entry_count,
const size_t  key_size_in_bytes 
)

Definition at line 826 of file HashJoinRuntime.cpp.

References CHECK, keys_are_equal(), and MurmurHash1Impl().

Referenced by count_matches_baseline(), and fill_row_ids_baseline().

831  {
832  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
833  uint32_t off = h * key_component_count;
834  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
835  return &composite_key_dict[off];
836  }
837  uint32_t h_probe = (h + 1) % entry_count;
838  while (h_probe != h) {
839  off = h_probe * key_component_count;
840  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
841  return &composite_key_dict[off];
842  }
843  h_probe = (h_probe + 1) % entry_count;
844  }
845 #ifndef __CUDACC__
846  CHECK(false);
847 #else
848  assert(false);
849 #endif
850  return nullptr;
851 }
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename InputIterator , typename OutputIterator >
void inclusive_scan ( InputIterator  first,
InputIterator  last,
OutputIterator  out,
const size_t  thread_count 
)

Definition at line 1356 of file HashJoinRuntime.cpp.

References threading_serial::async(), and gpu_enabled::partial_sum().

Referenced by approximate_distinct_tuples_on_device_overlaps(), approximate_distinct_tuples_on_device_range(), approximate_distinct_tuples_overlaps(), approximate_distinct_tuples_range(), fill_one_to_many_baseline_hash_table(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_impl(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), fill_one_to_many_hash_table_sharded_impl(), and gpu_enabled::partial_sum().

1359  {
1360  using ElementType = typename InputIterator::value_type;
1361  using OffsetType = typename InputIterator::difference_type;
1362  const OffsetType elem_count = last - first;
1363  if (elem_count < 10000 || thread_count <= 1) {
1364  ElementType sum = 0;
1365  for (auto iter = first; iter != last; ++iter, ++out) {
1366  *out = sum += *iter;
1367  }
1368  return;
1369  }
1370 
1371  const OffsetType step = (elem_count + thread_count - 1) / thread_count;
1372  OffsetType start_off = 0;
1373  OffsetType end_off = std::min(step, elem_count);
1374  std::vector<ElementType> partial_sums(thread_count);
1375  std::vector<std::future<void>> counter_threads;
1376  for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,
1377  start_off = std::min(start_off + step, elem_count),
1378  end_off = std::min(start_off + step, elem_count)) {
1379  counter_threads.push_back(std::async(
1381  [first, out](
1382  ElementType& partial_sum, const OffsetType start, const OffsetType end) {
1383  ElementType sum = 0;
1384  for (auto in_iter = first + start, out_iter = out + start;
1385  in_iter != (first + end);
1386  ++in_iter, ++out_iter) {
1387  *out_iter = sum += *in_iter;
1388  }
1389  partial_sum = sum;
1390  },
1391  std::ref(partial_sums[thread_idx]),
1392  start_off,
1393  end_off));
1394  }
1395  for (auto& child : counter_threads) {
1396  child.get();
1397  }
1398 
1399  ElementType sum = 0;
1400  for (auto& s : partial_sums) {
1401  s += sum;
1402  sum = s;
1403  }
1404 
1405  counter_threads.clear();
1406  start_off = std::min(step, elem_count);
1407  end_off = std::min(start_off + step, elem_count);
1408  for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,
1409  start_off = std::min(start_off + step, elem_count),
1410  end_off = std::min(start_off + step, elem_count)) {
1411  counter_threads.push_back(std::async(
1413  [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {
1414  for (auto iter = out + start; iter != (out + end); ++iter) {
1415  *iter += prev_sum;
1416  }
1417  },
1418  partial_sums[thread_idx],
1419  start_off,
1420  end_off));
1421  }
1422  for (auto& child : counter_threads) {
1423  child.get();
1424  }
1425 }
future< Result > async(Fn &&fn, Args &&...args)
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 354 of file HashJoinRuntime.cpp.

References get_invalid_key(), SUFFIX, and heavydb.dtypes::T.

Referenced by init_baseline_hash_join_buff_wrapper().

360  {
361 #ifdef __CUDACC__
362  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
363  int32_t step = blockDim.x * gridDim.x;
364 #else
365  int32_t start = cpu_thread_idx;
366  int32_t step = cpu_thread_count;
367 #endif
368  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
369  const T empty_key = SUFFIX(get_invalid_key)<T>();
370  for (int64_t h = start; h < entry_count; h += step) {
371  int64_t off = h * hash_entry_size;
372  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
373  for (size_t i = 0; i < key_component_count; ++i) {
374  row_ptr[i] = empty_key;
375  }
376  if (with_val_slot) {
377  row_ptr[key_component_count] = invalid_slot_val;
378  }
379  }
380 }
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1739 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1745  {
1746  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1747  entry_count,
1748  key_component_count,
1749  with_val_slot,
1750  invalid_slot_val,
1751  cpu_thread_idx,
1752  cpu_thread_count);
1753 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1755 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1761  {
1762  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1763  entry_count,
1764  key_component_count,
1765  with_val_slot,
1766  invalid_slot_val,
1767  cpu_thread_idx,
1768  cpu_thread_count);
1769 }

+ Here is the caller graph for this function:

DEVICE void SUFFIX() init_hash_join_buff ( int32_t *  groups_buffer,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 70 of file HashJoinRuntime.cpp.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

74  {
75 #ifdef __CUDACC__
76  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
77  int32_t step = blockDim.x * gridDim.x;
78 #else
79  int32_t start = cpu_thread_idx;
80  int32_t step = cpu_thread_count;
81 #endif
82  for (int64_t i = start; i < hash_entry_count; i += step) {
83  groups_buffer[i] = invalid_slot_val;
84  }
85 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1817 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1825  {
1826  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1827  entry_count,
1828  invalid_slot_val,
1829  false,
1830  key_component_count,
1831  with_val_slot,
1832  key_handler,
1833  num_elems,
1834  cpu_thread_idx,
1835  cpu_thread_count);
1836 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1881 of file HashJoinRuntime.cpp.

1889  {
1890  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1891  entry_count,
1892  invalid_slot_val,
1893  false,
1894  key_component_count,
1895  with_val_slot,
1896  key_handler,
1897  num_elems,
1898  cpu_thread_idx,
1899  cpu_thread_count);
1900 }
int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1838 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1846  {
1847  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1848  entry_count,
1849  invalid_slot_val,
1850  false,
1851  key_component_count,
1852  with_val_slot,
1853  key_handler,
1854  num_elems,
1855  cpu_thread_idx,
1856  cpu_thread_count);
1857 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1902 of file HashJoinRuntime.cpp.

1910  {
1911  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1912  entry_count,
1913  invalid_slot_val,
1914  false,
1915  key_component_count,
1916  with_val_slot,
1917  key_handler,
1918  num_elems,
1919  cpu_thread_idx,
1920  cpu_thread_count);
1921 }
template<typename T >
DEVICE int write_baseline_hash_slot ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 513 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and heavydb.dtypes::T.

521  {
522  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
523  T* matching_group = get_matching_baseline_hash_slot_at(
524  hash_buff, h, key, key_component_count, hash_entry_size);
525  if (!matching_group) {
526  uint32_t h_probe = (h + 1) % entry_count;
527  while (h_probe != h) {
528  matching_group = get_matching_baseline_hash_slot_at(
529  hash_buff, h_probe, key, key_component_count, hash_entry_size);
530  if (matching_group) {
531  break;
532  }
533  h_probe = (h_probe + 1) % entry_count;
534  }
535  }
536  if (!matching_group) {
537  return -2;
538  }
539  if (!with_val_slot) {
540  return 0;
541  }
542  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
543  return -1;
544  }
545  return 0;
546 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function:

template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 549 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and heavydb.dtypes::T.

557  {
558  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
559  T* matching_group = get_matching_baseline_hash_slot_at(
560  hash_buff, h, key, key_component_count, hash_entry_size);
561  if (!matching_group) {
562  uint32_t h_probe = (h + 1) % entry_count;
563  while (h_probe != h) {
564  matching_group = get_matching_baseline_hash_slot_at(
565  hash_buff, h_probe, key, key_component_count, hash_entry_size);
566  if (matching_group) {
567  break;
568  }
569  h_probe = (h_probe + 1) % entry_count;
570  }
571  }
572  if (!matching_group) {
573  return -2;
574  }
575  if (!with_val_slot) {
576  return 0;
577  }
578  mapd_cas(matching_group, invalid_slot_val, val);
579  return 0;
580 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function: