OmniSciDB  f17484ade4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntime.cpp File Reference
+ Include dependency graph for HashJoinRuntime.cpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict (const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl (OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
 
DEVICE int SUFFIX() fill_hash_join_buff_bitwise_eq (OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
 
DEVICE int SUFFIX() fill_hash_join_buff (OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T
*SUFFIX() 
get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_for_window_framing_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *f, const int64_t num_elems, const bool for_window_framing, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (OneToManyPerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (OneToManyPerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int bbox_intersect_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int bbox_intersect_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const size_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_bbox_intersect (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 

Macro Definition Documentation

#define cas_cst (   ptr,
  expected,
  desired 
)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 457 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst (   ptr)    __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 461 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add (   address,
  val 
)    __sync_fetch_and_add(address, val)
#define mapd_cas (   address,
  compare,
  val 
)    __sync_val_compare_and_swap(address, compare, val)
#define store_cst (   ptr,
  val 
)    __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 460 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2246 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2251  {
2252  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2253  CHECK(!join_column_per_key.empty());
2254 
2255  std::vector<std::future<void>> approx_distinct_threads;
2256  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2257  approx_distinct_threads.push_back(std::async(
2259  [&join_column_per_key,
2260  &type_info_per_key,
2261  b,
2262  hll_buffer_all_cpus,
2263  padded_size_bytes,
2264  thread_idx,
2265  thread_count] {
2266  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2267 
2268  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2269  false,
2270  &join_column_per_key[0],
2271  &type_info_per_key[0],
2272  nullptr,
2273  nullptr);
2275  nullptr,
2276  b,
2277  join_column_per_key[0].num_elems,
2278  &key_handler,
2279  thread_idx,
2280  thread_count);
2281  }));
2282  }
2283  for (auto& child : approx_distinct_threads) {
2284  child.get();
2285  }
2286 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_bbox_intersect ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2288 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by BoundingBoxIntersectJoinHashTable::approximateTupleCount().

2296  {
2297  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2298  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2299  CHECK(!join_column_per_key.empty());
2300 
2301  std::vector<std::future<void>> approx_distinct_threads;
2302  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2303  approx_distinct_threads.push_back(std::async(
2305  [&join_column_per_key,
2306  &join_buckets_per_key,
2307  &row_counts,
2308  b,
2309  hll_buffer_all_cpus,
2310  padded_size_bytes,
2311  thread_idx,
2312  thread_count] {
2313  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2314 
2315  const auto key_handler = BoundingBoxIntersectKeyHandler(
2316  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2317  &join_column_per_key[0],
2318  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2320  row_counts.data(),
2321  b,
2322  join_column_per_key[0].num_elems,
2323  &key_handler,
2324  thread_idx,
2325  thread_count);
2326  }));
2327  }
2328  for (auto& child : approx_distinct_threads) {
2329  child.get();
2330  }
2331 
2333  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2334 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1317 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), approximate_distinct_tuples_bbox_intersect(), and approximate_distinct_tuples_range().

1327  {
1328 #ifdef __CUDACC__
1329  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1330  int32_t step = blockDim.x * gridDim.x;
1331 #else
1332  int32_t start = cpu_thread_idx;
1333  int32_t step = cpu_thread_count;
1334 #endif
1335 
1336  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1337  const int64_t entry_idx,
1338  const int64_t* key_scratch_buff,
1339  const size_t key_component_count) {
1340  if (row_count_buffer) {
1341  row_count_buffer[entry_idx] += 1;
1342  }
1343 
1344  const uint64_t hash =
1345  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1346  const uint32_t index = hash >> (64 - b);
1347  const auto rank = get_rank(hash << b, 64 - b);
1348 #ifdef __CUDACC__
1349  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1350 #else
1351  hll_buffer[index] = std::max(hll_buffer[index], rank);
1352 #endif
1353 
1354  return 0;
1355  };
1356 
1357  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1358 
1359  JoinColumnTuple cols(
1360  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1361  for (auto& it : cols.slice(start, step)) {
1362  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1363  }
1364 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2336 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2345  {
2346  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2347  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2348  CHECK(!join_column_per_key.empty());
2349 
2350  std::vector<std::future<void>> approx_distinct_threads;
2351  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2352  approx_distinct_threads.push_back(std::async(
2354  [&join_column_per_key,
2355  &join_buckets_per_key,
2356  &row_counts,
2357  b,
2358  hll_buffer_all_cpus,
2359  padded_size_bytes,
2360  thread_idx,
2361  is_compressed,
2362  thread_count] {
2363  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2364 
2365  const auto key_handler = RangeKeyHandler(
2366  is_compressed,
2367  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2368  &join_column_per_key[0],
2369  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2371  row_counts.data(),
2372  b,
2373  join_column_per_key[0].num_elems,
2374  &key_handler,
2375  thread_idx,
2376  thread_count);
2377  }));
2378  }
2379  for (auto& child : approx_distinct_threads) {
2380  child.get();
2381  }
2382 
2384  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2385 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int bbox_intersect_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1865 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1874  {
1875  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1876  entry_count,
1877  invalid_slot_val,
1878  false,
1879  key_component_count,
1880  with_val_slot,
1881  key_handler,
1882  num_elems,
1883  cpu_thread_idx,
1884  cpu_thread_count);
1885 }

+ Here is the caller graph for this function:

int bbox_intersect_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1930 of file HashJoinRuntime.cpp.

1939  {
1940  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1941  entry_count,
1942  invalid_slot_val,
1943  false,
1944  key_component_count,
1945  with_val_slot,
1946  key_handler,
1947  num_elems,
1948  cpu_thread_idx,
1949  cpu_thread_count);
1950 }
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1387 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), anonymous_namespace{Utm.h}::N, JoinColumnIterator::ptr(), and SUFFIX.

1396  {
1397 #ifdef __CUDACC__
1398  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1399  int32_t step = blockDim.x * gridDim.x;
1400 #else
1401  int32_t start = cpu_thread_idx;
1402  int32_t step = cpu_thread_count;
1403 #endif
1404  JoinColumnIterator it(join_column, type_info, start, step);
1405  for (; it; ++it) {
1406  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1407  double bounds[2 * N];
1408  for (size_t j = 0; j < 2 * N; j++) {
1409  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1410  }
1411 
1412  for (size_t j = 0; j < N; j++) {
1413  const auto diff = bounds[j + N] - bounds[j];
1414 #ifdef __CUDACC__
1415  if (diff > bucket_size_thresholds[j]) {
1416  atomicMin(&bucket_sizes_for_thread[j], diff);
1417  }
1418 #else
1419  if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {
1420  bucket_sizes_for_thread[j] = diff;
1421  }
1422 #endif
1423  }
1424  }
1425 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
constexpr unsigned N
Definition: Utm.h:110
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:134

+ Here is the call graph for this function:

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2387 of file HashJoinRuntime.cpp.

References threading_serial::async().

Referenced by anonymous_namespace{BoundingBoxIntersectJoinHashTable.cpp}::compute_bucket_sizes().

2391  {
2392  std::vector<std::vector<double>> bucket_sizes_for_threads;
2393  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2394  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2395  }
2396  std::vector<std::future<void>> threads;
2397  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2398  threads.push_back(std::async(std::launch::async,
2399  compute_bucket_sizes_impl<2>,
2400  bucket_sizes_for_threads[thread_idx].data(),
2401  &join_column,
2402  &type_info,
2403  bucket_size_thresholds.data(),
2404  thread_idx,
2405  thread_count));
2406  }
2407  for (auto& child : threads) {
2408  child.get();
2409  }
2410 
2411  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2412  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2413  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2414  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2415  }
2416  }
2417  }
2418 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 701 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

711  {
712  auto slot_sel = [&type_info](auto count_buff, auto elem) {
713  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
714  };
715  count_matches_impl(count_buff,
716  join_column,
717  type_info
718 #ifndef __CUDACC__
719  ,
720  sd_inner_to_outer_translation_map,
721  min_inner_elem,
722  cpu_thread_idx,
723  cpu_thread_count
724 #endif
725  ,
726  slot_sel);
727 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 846 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and heavydb.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

856  {
857 #ifdef __CUDACC__
858  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
859  int32_t step = blockDim.x * gridDim.x;
860 #else
861  int32_t start = cpu_thread_idx;
862  int32_t step = cpu_thread_count;
863 #endif
864 #ifdef __CUDACC__
865  assert(composite_key_dict);
866 #endif
867  T key_scratch_buff[g_maximum_conditions_to_coalesce];
868  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
869  auto key_buff_handler = [composite_key_dict,
870  entry_count,
871  count_buff,
872  key_size_in_bytes](const int64_t row_entry_idx,
873  const T* key_scratch_buff,
874  const size_t key_component_count) {
875  const auto matching_group =
877  key_component_count,
878  composite_key_dict,
879  entry_count,
880  key_size_in_bytes);
881  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
882  mapd_add(&count_buff[entry_idx], int32_t(1));
883  return 0;
884  };
885 
886  JoinColumnTuple cols(
887  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
888  for (auto& it : cols.slice(start, step)) {
889  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
890  }
891 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 729 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

741  {
742  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
743  return SUFFIX(get_bucketized_hash_slot)(count_buff,
744  elem,
745  type_info.min_val / bucket_normalization,
746  type_info.translated_null_val,
747  bucket_normalization);
748  };
749  count_matches_impl(count_buff,
750  join_column,
751  type_info
752 #ifndef __CUDACC__
753  ,
754  sd_inner_to_outer_translation_map,
755  min_inner_elem,
756  cpu_thread_idx,
757  cpu_thread_count
758 #endif
759  ,
760  slot_sel);
761 }
#define SUFFIX(name)
const int64_t translated_null_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 653 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), and mapd_add.

Referenced by count_matches(), and count_matches_bucketized().

664  {
665 #ifdef __CUDACC__
666  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
667  int32_t step = blockDim.x * gridDim.x;
668 #else
669  int32_t start = cpu_thread_idx;
670  int32_t step = cpu_thread_count;
671 #endif
672  JoinColumnTyped col{&join_column, &type_info};
673  for (auto item : col.slice(start, step)) {
674  int64_t elem = item.element;
675  if (elem == type_info.null_val) {
676  if (type_info.uses_bw_eq) {
677  elem = type_info.translated_null_val;
678  } else {
679  continue;
680  }
681  }
682 #ifndef __CUDACC__
683  if (sd_inner_to_outer_translation_map &&
684  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
685  const auto outer_id = map_str_id_to_outer_dict(elem,
686  min_inner_elem,
687  type_info.min_val,
688  type_info.max_val,
689  sd_inner_to_outer_translation_map);
690  if (outer_id == StringDictionary::INVALID_STR_ID) {
691  continue;
692  }
693  elem = outer_id;
694  }
695 #endif
696  auto* entry_ptr = slot_selector(count_buff, elem);
697  mapd_add(entry_ptr, int32_t(1));
698  }
699 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 763 of file HashJoinRuntime.cpp.

References get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

775  {
776 #ifdef __CUDACC__
777  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
778  int32_t step = blockDim.x * gridDim.x;
779 #else
780  int32_t start = cpu_thread_idx;
781  int32_t step = cpu_thread_count;
782 #endif
783  JoinColumnTyped col{&join_column, &type_info};
784  for (auto item : col.slice(start, step)) {
785  int64_t elem = item.element;
786  if (elem == type_info.null_val) {
787  if (type_info.uses_bw_eq) {
788  elem = type_info.translated_null_val;
789  } else {
790  continue;
791  }
792  }
793 #ifndef __CUDACC__
794  if (sd_inner_to_outer_translation_map &&
795  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
796  const auto outer_id = map_str_id_to_outer_dict(elem,
797  min_inner_elem,
798  type_info.min_val,
799  type_info.max_val,
800  sd_inner_to_outer_translation_map);
801  if (outer_id == StringDictionary::INVALID_STR_ID) {
802  continue;
803  }
804  elem = outer_id;
805  }
806 #endif
807  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
808  elem,
809  type_info.min_val,
810  shard_info.entry_count_per_shard,
811  shard_info.num_shards,
812  shard_info.device_count);
813  mapd_add(entry_ptr, int32_t(1));
814  }
815 }
#define SUFFIX(name)
const int64_t null_val
const int64_t translated_null_val
size_t num_shards
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:108
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 575 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, and heavydb.dtypes::T.

584  {
585 #ifdef __CUDACC__
586  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
587  int32_t step = blockDim.x * gridDim.x;
588 #else
589  int32_t start = cpu_thread_idx;
590  int32_t step = cpu_thread_count;
591 #endif
592 
593  T key_scratch_buff[g_maximum_conditions_to_coalesce];
594  const size_t key_size_in_bytes = key_component_count * sizeof(T);
595  const size_t hash_entry_size =
596  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
597  auto key_buff_handler = [hash_buff,
598  entry_count,
599  with_val_slot,
600  invalid_slot_val,
601  key_size_in_bytes,
602  hash_entry_size,
603  &for_semi_join](const int64_t entry_idx,
604  const T* key_scratch_buffer,
605  const size_t key_component_count) {
606  if (for_semi_join) {
607  return write_baseline_hash_slot_for_semi_join<T>(entry_idx,
608  hash_buff,
609  entry_count,
610  key_scratch_buffer,
611  key_component_count,
612  with_val_slot,
613  invalid_slot_val,
614  key_size_in_bytes,
615  hash_entry_size);
616  } else {
617  return write_baseline_hash_slot<T>(entry_idx,
618  hash_buff,
619  entry_count,
620  key_scratch_buffer,
621  key_component_count,
622  with_val_slot,
623  invalid_slot_val,
624  key_size_in_bytes,
625  hash_entry_size);
626  }
627  };
628 
629  JoinColumnTuple cols(
630  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
631  for (auto& it : cols.slice(start, step)) {
632  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
633  if (err) {
634  return err;
635  }
636  }
637  return 0;
638 }
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
const size_t g_maximum_conditions_to_coalesce
int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1843 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1852  {
1853  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1854  entry_count,
1855  invalid_slot_val,
1856  for_semi_join,
1857  key_component_count,
1858  with_val_slot,
1859  key_handler,
1860  num_elems,
1861  cpu_thread_idx,
1862  cpu_thread_count);
1863 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1908 of file HashJoinRuntime.cpp.

1917  {
1918  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1919  entry_count,
1920  invalid_slot_val,
1921  for_semi_join,
1922  key_component_count,
1923  with_val_slot,
1924  key_handler,
1925  num_elems,
1926  cpu_thread_idx,
1927  cpu_thread_count);
1928 }
DEVICE int SUFFIX() fill_hash_join_buff ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 202 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

205  {
206  auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
208  auto hashtable_filling_func = [&](auto elem, size_t index) {
209  auto entry_ptr = SUFFIX(get_hash_slot)(args.buff, elem, args.type_info.min_val);
210  return filling_func(index, entry_ptr, args.invalid_slot_val);
211  };
212 
214  args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);
215 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_bitwise_eq ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
int32_t const  cpu_thread_idx,
int32_t const  cpu_thread_count 
)

Definition at line 186 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot_bitwise_eq(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

189  {
190  auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
192  auto hashtable_filling_func = [&](auto elem, size_t index) {
193  auto entry_ptr = SUFFIX(get_hash_slot_bitwise_eq)(
194  args.buff, elem, args.type_info.min_val, args.type_info.translated_null_val);
195  return filling_func(index, entry_ptr, args.invalid_slot_val);
196  };
197 
199  args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);
200 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_bitwise_eq(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val)
Definition: JoinHashImpl.h:82
DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
int32_t const  cpu_thread_idx,
int32_t const  cpu_thread_count 
)

Definition at line 166 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

169  {
170  auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
172  auto hashtable_filling_func = [&](auto elem, size_t index) {
173  auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(
174  args.buff,
175  elem,
176  args.type_info.min_val / args.bucket_normalization,
177  args.type_info.translated_null_val,
178  args.bucket_normalization);
179  return filling_func(index, entry_ptr, args.invalid_slot_val);
180  };
181 
183  args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);
184 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 119 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, OneToOnePerfectJoinHashTableFillFuncArgs::join_column, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), OneToOnePerfectJoinHashTableFillFuncArgs::min_inner_elem, OneToOnePerfectJoinHashTableFillFuncArgs::sd_inner_to_outer_translation_map, and OneToOnePerfectJoinHashTableFillFuncArgs::type_info.

Referenced by fill_hash_join_buff(), fill_hash_join_buff_bitwise_eq(), and fill_hash_join_buff_bucketized().

122  {
123 #ifdef __CUDACC__
124  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
125  int32_t step = blockDim.x * gridDim.x;
126 #else
127  int32_t start = cpu_thread_idx;
128  int32_t step = cpu_thread_count;
129 #endif
130  auto const join_column = args.join_column;
131  auto const type_info = args.type_info;
132  JoinColumnTyped col{&join_column, &type_info};
133  for (auto item : col.slice(start, step)) {
134  const size_t index = item.index;
135  int64_t elem = item.element;
136  if (elem == type_info.null_val) {
137  if (type_info.uses_bw_eq) {
138  elem = type_info.translated_null_val;
139  } else {
140  continue;
141  }
142  }
143 #ifndef __CUDACC__
144  auto const sd_inner_to_outer_translation_map = args.sd_inner_to_outer_translation_map;
145  auto const min_inner_elem = args.min_inner_elem;
146  if (sd_inner_to_outer_translation_map &&
147  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
148  const auto outer_id = map_str_id_to_outer_dict(elem,
149  min_inner_elem,
150  type_info.min_val,
151  type_info.max_val,
152  sd_inner_to_outer_translation_map);
153  if (outer_id == StringDictionary::INVALID_STR_ID) {
154  continue;
155  }
156  elem = outer_id;
157  }
158 #endif
159  if (filling_func(elem, index)) {
160  return -1;
161  }
162  }
163  return 0;
164 };
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 310 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

320  {
321  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
322  : SUFFIX(fill_one_to_one_hashtable);
323  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
324  auto entry_ptr = SUFFIX(get_hash_slot_sharded_opt)(buff,
325  elem,
326  type_info.min_val,
327  shard_info.entry_count_per_shard,
328  shard,
329  shard_info.num_shards,
330  shard_info.device_count);
331  return filling_func(index, entry_ptr, invalid_slot_val);
332  };
333 
335  join_column,
336  type_info,
337  shard_info,
338  sd_inner_to_outer_translation_map,
339  min_inner_elem,
340  cpu_thread_idx,
341  cpu_thread_count,
342  hashtable_filling_func);
343 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
size_t num_shards
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:140
size_t entry_count_per_shard
const int64_t min_val
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 271 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

282  {
283  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
284  : SUFFIX(fill_one_to_one_hashtable);
285  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
287  buff,
288  elem,
289  type_info.min_val / bucket_normalization,
290  type_info.translated_null_val,
291  shard_info.entry_count_per_shard,
292  shard,
293  shard_info.num_shards,
294  shard_info.device_count,
295  bucket_normalization);
296  return filling_func(index, entry_ptr, invalid_slot_val);
297  };
298 
300  join_column,
301  type_info,
302  shard_info,
303  sd_inner_to_outer_translation_map,
304  min_inner_elem,
305  cpu_thread_idx,
306  cpu_thread_count,
307  hashtable_filling_func);
308 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const int64_t translated_null_val
size_t num_shards
size_t entry_count_per_shard
const int64_t min_val
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:122

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 218 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

227  {
228 #ifdef __CUDACC__
229  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
230  int32_t step = blockDim.x * gridDim.x;
231 #else
232  int32_t start = cpu_thread_idx;
233  int32_t step = cpu_thread_count;
234 #endif
235  JoinColumnTyped col{&join_column, &type_info};
236  for (auto item : col.slice(start, step)) {
237  const size_t index = item.index;
238  int64_t elem = item.element;
239  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
240  if (shard != shard_info.shard) {
241  continue;
242  }
243  if (elem == type_info.null_val) {
244  if (type_info.uses_bw_eq) {
245  elem = type_info.translated_null_val;
246  } else {
247  continue;
248  }
249  }
250 #ifndef __CUDACC__
251  if (sd_inner_to_outer_translation_map &&
252  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
253  const auto outer_id = map_str_id_to_outer_dict(elem,
254  min_inner_elem,
255  type_info.min_val,
256  type_info.max_val,
257  sd_inner_to_outer_translation_map);
258  if (outer_id == StringDictionary::INVALID_STR_ID) {
259  continue;
260  }
261  elem = outer_id;
262  }
263 #endif
264  if (filling_func(elem, shard, index)) {
265  return -1;
266  }
267  }
268  return 0;
269 }
const int64_t null_val
const int64_t translated_null_val
size_t num_shards
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const size_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed,
const bool  for_window_framing 
)

Definition at line 1974 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), inclusive_scan(), and SUFFIX.

1987  {
1988  int32_t* pos_buff = buff;
1989  int32_t* count_buff = buff + hash_entry_count;
1990  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1991  std::vector<std::future<void>> counter_threads;
1992  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1993  if (is_range_join) {
1994  counter_threads.push_back(std::async(
1996  [count_buff,
1997  composite_key_dict,
1998  &hash_entry_count,
1999  &join_buckets_per_key,
2000  &join_column_per_key,
2001  &is_geo_compressed,
2002  cpu_thread_idx,
2003  cpu_thread_count] {
2004  const auto key_handler = RangeKeyHandler(
2005  is_geo_compressed,
2006  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2007  &join_column_per_key[0],
2008  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2009  count_matches_baseline(count_buff,
2010  composite_key_dict,
2011  hash_entry_count,
2012  &key_handler,
2013  join_column_per_key[0].num_elems,
2014  cpu_thread_idx,
2015  cpu_thread_count);
2016  }));
2017  } else if (join_buckets_per_key.size() > 0) {
2018  counter_threads.push_back(std::async(
2020  [count_buff,
2021  composite_key_dict,
2022  &hash_entry_count,
2023  &join_buckets_per_key,
2024  &join_column_per_key,
2025  cpu_thread_idx,
2026  cpu_thread_count] {
2027  const auto key_handler = BoundingBoxIntersectKeyHandler(
2028  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2029  &join_column_per_key[0],
2030  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2031  count_matches_baseline(count_buff,
2032  composite_key_dict,
2033  hash_entry_count,
2034  &key_handler,
2035  join_column_per_key[0].num_elems,
2036  cpu_thread_idx,
2037  cpu_thread_count);
2038  }));
2039  } else {
2040  counter_threads.push_back(
2042  [count_buff,
2043  composite_key_dict,
2044  &key_component_count,
2045  &hash_entry_count,
2046  &join_column_per_key,
2047  &type_info_per_key,
2048  &sd_inner_to_outer_translation_maps,
2049  &sd_min_inner_elems,
2050  cpu_thread_idx,
2051  cpu_thread_count] {
2052  const auto key_handler =
2053  GenericKeyHandler(key_component_count,
2054  true,
2055  &join_column_per_key[0],
2056  &type_info_per_key[0],
2057  &sd_inner_to_outer_translation_maps[0],
2058  &sd_min_inner_elems[0]);
2059  count_matches_baseline(count_buff,
2060  composite_key_dict,
2061  hash_entry_count,
2062  &key_handler,
2063  join_column_per_key[0].num_elems,
2064  cpu_thread_idx,
2065  cpu_thread_count);
2066  }));
2067  }
2068  }
2069 
2070  for (auto& child : counter_threads) {
2071  child.get();
2072  }
2073 
2074  std::vector<int32_t> count_copy(hash_entry_count, 0);
2075  CHECK_GT(hash_entry_count, int64_t(0));
2076  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
2078  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
2079  std::vector<std::future<void>> pos_threads;
2080  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
2081  pos_threads.push_back(std::async(
2083  [&](const int thread_idx) {
2084  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
2085  if (count_buff[i]) {
2086  pos_buff[i] = count_copy[i];
2087  }
2088  }
2089  },
2090  cpu_thread_idx));
2091  }
2092  for (auto& child : pos_threads) {
2093  child.get();
2094  }
2095 
2096  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
2097  std::vector<std::future<void>> rowid_threads;
2098  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
2099  if (is_range_join) {
2100  rowid_threads.push_back(std::async(
2102  [buff,
2103  composite_key_dict,
2104  hash_entry_count,
2105  &join_column_per_key,
2106  &join_buckets_per_key,
2107  &is_geo_compressed,
2108  cpu_thread_idx,
2109  cpu_thread_count] {
2110  const auto key_handler = RangeKeyHandler(
2111  is_geo_compressed,
2112  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2113  &join_column_per_key[0],
2114  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2116  (buff,
2117  composite_key_dict,
2118  hash_entry_count,
2119  &key_handler,
2120  join_column_per_key[0].num_elems,
2121  false,
2122  cpu_thread_idx,
2123  cpu_thread_count);
2124  }));
2125  } else if (join_buckets_per_key.size() > 0) {
2126  rowid_threads.push_back(std::async(
2128  [buff,
2129  composite_key_dict,
2130  hash_entry_count,
2131  &join_column_per_key,
2132  &join_buckets_per_key,
2133  for_window_framing,
2134  cpu_thread_idx,
2135  cpu_thread_count] {
2136  const auto key_handler = BoundingBoxIntersectKeyHandler(
2137  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2138  &join_column_per_key[0],
2139  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2141  (buff,
2142  composite_key_dict,
2143  hash_entry_count,
2144  &key_handler,
2145  join_column_per_key[0].num_elems,
2146  for_window_framing,
2147  cpu_thread_idx,
2148  cpu_thread_count);
2149  }));
2150  } else {
2151  rowid_threads.push_back(std::async(std::launch::async,
2152  [buff,
2153  composite_key_dict,
2154  hash_entry_count,
2155  key_component_count,
2156  &join_column_per_key,
2157  &type_info_per_key,
2158  &sd_inner_to_outer_translation_maps,
2159  &sd_min_inner_elems,
2160  for_window_framing,
2161  cpu_thread_idx,
2162  cpu_thread_count] {
2163  const auto key_handler = GenericKeyHandler(
2164  key_component_count,
2165  true,
2166  &join_column_per_key[0],
2167  &type_info_per_key[0],
2168  &sd_inner_to_outer_translation_maps[0],
2169  &sd_min_inner_elems[0]);
2171  (buff,
2172  composite_key_dict,
2173  hash_entry_count,
2174  &key_handler,
2175  join_column_per_key[0].num_elems,
2176  for_window_framing,
2177  cpu_thread_idx,
2178  cpu_thread_count);
2179  }));
2180  }
2181  }
2182 
2183  for (auto& child : rowid_threads) {
2184  child.get();
2185  }
2186 }
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:305
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *f, const int64_t num_elems, const bool for_window_framing, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed,
const bool  for_window_framing 
)

Definition at line 2188 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2201  {
2202  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2203  composite_key_dict,
2204  hash_entry_count,
2205  key_component_count,
2206  join_column_per_key,
2207  type_info_per_key,
2208  join_bucket_info,
2209  sd_inner_to_outer_translation_maps,
2210  sd_min_inner_elems,
2211  cpu_thread_count,
2212  is_range_join,
2213  is_geo_compressed,
2214  for_window_framing);
2215 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed,
const bool  for_window_framing 
)

Definition at line 2217 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2230  {
2231  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2232  composite_key_dict,
2233  hash_entry_count,
2234  key_component_count,
2235  join_column_per_key,
2236  type_info_per_key,
2237  join_bucket_info,
2238  sd_inner_to_outer_translation_maps,
2239  sd_min_inner_elems,
2240  cpu_thread_count,
2241  is_range_join,
2242  is_geo_compressed,
2243  for_window_framing);
2244 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( OneToManyPerfectJoinHashTableFillFuncArgs const  args,
const int32_t  cpu_thread_count 
)

Definition at line 1563 of file HashJoinRuntime.cpp.

References run_benchmark_import::args, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), OneToManyPerfectJoinHashTableFillFuncArgs::for_window_framing, OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, OneToManyPerfectJoinHashTableFillFuncArgs::min_inner_elem, OneToManyPerfectJoinHashTableFillFuncArgs::sd_inner_to_outer_translation_map, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1564  {
1565  auto timer = DEBUG_TIMER(__func__);
1566  auto const buff = args.buff;
1567  auto const hash_entry_info = args.hash_entry_info;
1568  auto launch_count_matches = [count_buff =
1569  buff + hash_entry_info.bucketized_hash_entry_count,
1570  &args](auto cpu_thread_idx, auto cpu_thread_count) {
1572  (count_buff,
1573  args.join_column,
1574  args.type_info,
1575  args.sd_inner_to_outer_translation_map,
1576  args.min_inner_elem,
1577  cpu_thread_idx,
1578  cpu_thread_count);
1579  };
1580  auto launch_fill_row_ids =
1581  [hash_entry_count = hash_entry_info.bucketized_hash_entry_count, buff, args](
1582  auto cpu_thread_idx, auto cpu_thread_count) {
1584  (buff,
1585  hash_entry_count,
1586  args.join_column,
1587  args.type_info,
1588  args.for_window_framing,
1589  args.sd_inner_to_outer_translation_map,
1590  args.min_inner_elem,
1591  cpu_thread_idx,
1592  cpu_thread_count);
1593  };
1594 
1596  hash_entry_info.bucketized_hash_entry_count,
1597  args.join_column,
1598  args.type_info,
1599  args.sd_inner_to_outer_translation_map,
1600  args.min_inner_elem,
1601  cpu_thread_count,
1602  args.for_window_framing,
1603  launch_count_matches,
1604  launch_fill_row_ids);
1605 }
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define DEBUG_TIMER(name)
Definition: Logger.h:412
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( OneToManyPerfectJoinHashTableFillFuncArgs const  args,
const int32_t  cpu_thread_count 
)

Definition at line 1607 of file HashJoinRuntime.cpp.

References run_benchmark_import::args, BucketizedHashEntryInfo::bucket_normalization, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, OneToManyPerfectJoinHashTableFillFuncArgs::min_inner_elem, OneToManyPerfectJoinHashTableFillFuncArgs::sd_inner_to_outer_translation_map, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1609  {
1610  auto timer = DEBUG_TIMER(__func__);
1611  auto const buff = args.buff;
1612  auto const hash_entry_info = args.hash_entry_info;
1613  auto bucket_normalization = hash_entry_info.bucket_normalization;
1614  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1615  auto launch_count_matches = [bucket_normalization,
1616  count_buff = buff + hash_entry_count,
1617  &args](auto cpu_thread_idx, auto cpu_thread_count) {
1619  (count_buff,
1620  args.join_column,
1621  args.type_info,
1622  args.sd_inner_to_outer_translation_map,
1623  args.min_inner_elem,
1624  cpu_thread_idx,
1625  cpu_thread_count,
1626  bucket_normalization);
1627  };
1628  auto launch_fill_row_ids = [bucket_normalization, hash_entry_count, buff, args](
1629  auto cpu_thread_idx, auto cpu_thread_count) {
1631  (buff,
1632  hash_entry_count,
1633  args.join_column,
1634  args.type_info,
1635  args.sd_inner_to_outer_translation_map,
1636  args.min_inner_elem,
1637  cpu_thread_idx,
1638  cpu_thread_count,
1639  bucket_normalization);
1640  };
1641 
1643  hash_entry_count,
1644  args.join_column,
1645  args.type_info,
1646  args.sd_inner_to_outer_translation_map,
1647  args.min_inner_elem,
1648  cpu_thread_count,
1649  false,
1650  launch_count_matches,
1651  launch_fill_row_ids);
1652 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define DEBUG_TIMER(name)
Definition: Logger.h:412
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_count,
const bool  for_window_framing,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1502 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1511  {
1512  auto timer = DEBUG_TIMER(__func__);
1513  int32_t* pos_buff = buff;
1514  int32_t* count_buff = buff + hash_entry_count;
1515  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1516  std::vector<std::future<void>> counter_threads;
1517  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1518  counter_threads.push_back(std::async(
1519  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1520  }
1521 
1522  for (auto& child : counter_threads) {
1523  child.get();
1524  }
1525 
1526  std::vector<int32_t> count_copy(hash_entry_count, 0);
1527  CHECK_GT(hash_entry_count, int64_t(0));
1528  memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1529 #if HAVE_CUDA
1530  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1531 #else
1533  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1534 #endif
1535  std::vector<std::future<void>> pos_threads;
1536  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1537  pos_threads.push_back(std::async(
1539  [&](size_t thread_idx) {
1540  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1541  if (count_buff[i]) {
1542  pos_buff[i] = count_copy[i];
1543  }
1544  }
1545  },
1546  cpu_thread_idx));
1547  }
1548  for (auto& child : pos_threads) {
1549  child.get();
1550  }
1551  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1552  std::vector<std::future<void>> rowid_threads;
1553  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1554  rowid_threads.push_back(std::async(
1555  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1556  }
1557 
1558  for (auto& child : rowid_threads) {
1559  child.get();
1560  }
1561 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:305
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:412

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_count 
)

Definition at line 1714 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1721  {
1722  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1723  &join_column,
1724  &type_info,
1725  &shard_info
1726 #ifndef __CUDACC__
1727  ,
1728  sd_inner_to_outer_translation_map,
1729  min_inner_elem
1730 #endif
1731  ](auto cpu_thread_idx, auto cpu_thread_count) {
1732  return SUFFIX(count_matches_sharded)(count_buff,
1733  join_column,
1734  type_info,
1735  shard_info
1736 #ifndef __CUDACC__
1737  ,
1738  sd_inner_to_outer_translation_map,
1739  min_inner_elem,
1740  cpu_thread_idx,
1741  cpu_thread_count
1742 #endif
1743  );
1744  };
1745 
1746  auto launch_fill_row_ids = [buff,
1747  hash_entry_count,
1748  &join_column,
1749  &type_info,
1750  &shard_info
1751 #ifndef __CUDACC__
1752  ,
1753  sd_inner_to_outer_translation_map,
1754  min_inner_elem
1755 #endif
1756  ](auto cpu_thread_idx, auto cpu_thread_count) {
1757  return SUFFIX(fill_row_ids_sharded)(buff,
1758  hash_entry_count,
1759  join_column,
1760  type_info,
1761  shard_info
1762 #ifndef __CUDACC__
1763  ,
1764  sd_inner_to_outer_translation_map,
1765  min_inner_elem,
1766  cpu_thread_idx,
1767  cpu_thread_count);
1768 #endif
1769  };
1770 
1772  hash_entry_count,
1773  join_column,
1774  type_info,
1775  shard_info
1776 #ifndef __CUDACC__
1777  ,
1778  sd_inner_to_outer_translation_map,
1779  min_inner_elem,
1780  cpu_thread_count
1781 #endif
1782  ,
1783  launch_count_matches,
1784  launch_fill_row_ids);
1785 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1655 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1665  {
1666  auto timer = DEBUG_TIMER(__func__);
1667  int32_t* pos_buff = buff;
1668  int32_t* count_buff = buff + hash_entry_count;
1669  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1670  std::vector<std::future<void>> counter_threads;
1671  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1672  counter_threads.push_back(std::async(
1673  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1674  }
1675 
1676  for (auto& child : counter_threads) {
1677  child.get();
1678  }
1679 
1680  std::vector<int32_t> count_copy(hash_entry_count, 0);
1681  CHECK_GT(hash_entry_count, int64_t(0));
1682  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1684  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1685  std::vector<std::future<void>> pos_threads;
1686  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1687  pos_threads.push_back(std::async(
1689  [&](const int32_t thread_idx) {
1690  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1691  if (count_buff[i]) {
1692  pos_buff[i] = count_copy[i];
1693  }
1694  }
1695  },
1696  cpu_thread_idx));
1697  }
1698  for (auto& child : pos_threads) {
1699  child.get();
1700  }
1701 
1702  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1703  std::vector<std::future<void>> rowid_threads;
1704  for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1705  rowid_threads.push_back(std::async(
1706  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1707  }
1708 
1709  for (auto& child : rowid_threads) {
1710  child.get();
1711  }
1712 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:305
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:412

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const bool  for_window_framing,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1026 of file HashJoinRuntime.cpp.

References fill_row_ids_for_window_framing_impl(), fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

1038  {
1039  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
1040  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
1041  };
1042 
1043  if (!for_window_framing) {
1044  fill_row_ids_impl(buff,
1045  hash_entry_count,
1046  join_column,
1047  type_info
1048 #ifndef __CUDACC__
1049  ,
1050  sd_inner_to_outer_translation_map,
1051  min_inner_elem,
1052  cpu_thread_idx,
1053  cpu_thread_count
1054 #endif
1055  ,
1056  slot_sel);
1057  } else {
1059  hash_entry_count,
1060  join_column,
1061  type_info
1062 #ifndef __CUDACC__
1063  ,
1064  sd_inner_to_outer_translation_map,
1065  min_inner_elem,
1066  cpu_thread_idx,
1067  cpu_thread_count
1068 #endif
1069  ,
1070  slot_sel);
1071  }
1072 }
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
const int64_t min_val
DEVICE void fill_row_ids_for_window_framing_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const bool  for_window_framing,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1250 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and heavydb.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

1261  {
1262  int32_t* pos_buff = buff;
1263  int32_t* count_buff = buff + hash_entry_count;
1264  int32_t* id_buff = count_buff + hash_entry_count;
1265  int32_t* reversed_id_buff = for_window_framing ? id_buff + num_elems : nullptr;
1266 #ifdef __CUDACC__
1267  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1268  int32_t step = blockDim.x * gridDim.x;
1269 #else
1270  int32_t start = cpu_thread_idx;
1271  int32_t step = cpu_thread_count;
1272 #endif
1273 
1274  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1275 #ifdef __CUDACC__
1276  assert(composite_key_dict);
1277 #endif
1278  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1279  auto key_buff_handler = [composite_key_dict,
1280  hash_entry_count,
1281  pos_buff,
1282  count_buff,
1283  id_buff,
1284  reversed_id_buff,
1285  key_size_in_bytes,
1286  for_window_framing](const int64_t row_index,
1287  const T* key_scratch_buff,
1288  const size_t key_component_count) {
1289  const T* matching_group =
1291  key_component_count,
1292  composite_key_dict,
1293  hash_entry_count,
1294  key_size_in_bytes);
1295  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1296  int32_t* pos_ptr = pos_buff + entry_idx;
1297  const auto bin_idx = pos_ptr - pos_buff;
1298  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1299  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1300  if (for_window_framing) {
1301  reversed_id_buff[row_index] = id_buff_idx;
1302  }
1303  return 0;
1304  };
1305 
1306  JoinColumnTuple cols(
1307  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1308  for (auto& it : cols.slice(start, step)) {
1309  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1310  }
1311  return;
1312 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1074 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

1087  {
1088  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
1089  return SUFFIX(get_bucketized_hash_slot)(pos_buff,
1090  elem,
1091  type_info.min_val / bucket_normalization,
1092  type_info.translated_null_val,
1093  bucket_normalization);
1094  };
1095 
1096  fill_row_ids_impl(buff,
1097  hash_entry_count,
1098  join_column,
1099  type_info
1100 #ifndef __CUDACC__
1101  ,
1102  sd_inner_to_outer_translation_map,
1103  min_inner_elem,
1104  cpu_thread_idx,
1105  cpu_thread_count
1106 #endif
1107  ,
1108  slot_sel);
1109 }
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
const int64_t translated_null_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_for_window_framing_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 950 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, and JoinColumn::num_elems.

Referenced by fill_row_ids().

963  {
964  int32_t* pos_buff = buff;
965  int32_t* count_buff = buff + hash_entry_count;
966  int32_t* id_buff = count_buff + hash_entry_count;
967  int32_t* reversed_id_buff = id_buff + join_column.num_elems;
968 #ifdef __CUDACC__
969  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
970  int32_t step = blockDim.x * gridDim.x;
971 #else
972  int32_t start = cpu_thread_idx;
973  int32_t step = cpu_thread_count;
974 
975 #endif
976  if (join_column.num_elems == 0) {
977  return;
978  }
979  JoinColumnTyped col{&join_column, &type_info};
980  bool all_nulls = hash_entry_count == 1 && type_info.min_val == 0 &&
981  type_info.max_val == -1 &&
982  (*col.begin()).element == type_info.null_val;
983  if (all_nulls) {
984  int32_t thread_idx = -1;
985 #ifdef __CUDACC__
986  thread_idx = threadIdx.x;
987 #else
988  thread_idx = cpu_thread_idx;
989 #endif
990  if (thread_idx == 0) {
991  pos_buff[0] = 0;
992  count_buff[0] = join_column.num_elems - 1;
993  for (size_t i = 0; i < join_column.num_elems; i++) {
994  reversed_id_buff[i] = i;
995  }
996  }
997  return;
998  }
999  for (auto item : col.slice(start, step)) {
1000  const size_t index = item.index;
1001  int64_t elem = item.element;
1002  if (elem == type_info.null_val) {
1003  elem = type_info.translated_null_val;
1004  }
1005 #ifndef __CUDACC__
1006  if (sd_inner_to_outer_translation_map && elem != type_info.translated_null_val) {
1007  const auto outer_id = map_str_id_to_outer_dict(elem,
1008  min_inner_elem,
1009  type_info.min_val,
1010  type_info.max_val,
1011  sd_inner_to_outer_translation_map);
1012  if (outer_id == StringDictionary::INVALID_STR_ID) {
1013  continue;
1014  }
1015  elem = outer_id;
1016  }
1017 #endif
1018  auto pos_ptr = slot_selector(pos_buff, elem);
1019  const auto bin_idx = pos_ptr - pos_buff;
1020  auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1021  id_buff[id_buff_idx] = static_cast<int32_t>(index);
1022  reversed_id_buff[index] = id_buff_idx;
1023  }
1024 }
size_t num_elems
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 894 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), and mapd_add.

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

906  {
907  int32_t* pos_buff = buff;
908  int32_t* count_buff = buff + hash_entry_count;
909  int32_t* id_buff = count_buff + hash_entry_count;
910 #ifdef __CUDACC__
911  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
912  int32_t step = blockDim.x * gridDim.x;
913 #else
914  int32_t start = cpu_thread_idx;
915  int32_t step = cpu_thread_count;
916 #endif
917  JoinColumnTyped col{&join_column, &type_info};
918  for (auto item : col.slice(start, step)) {
919  const size_t index = item.index;
920  int64_t elem = item.element;
921  if (elem == type_info.null_val) {
922  if (type_info.uses_bw_eq) {
923  elem = type_info.translated_null_val;
924  } else {
925  continue;
926  }
927  }
928 #ifndef __CUDACC__
929  if (sd_inner_to_outer_translation_map &&
930  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
931  const auto outer_id = map_str_id_to_outer_dict(elem,
932  min_inner_elem,
933  type_info.min_val,
934  type_info.max_val,
935  sd_inner_to_outer_translation_map);
936  if (outer_id == StringDictionary::INVALID_STR_ID) {
937  continue;
938  }
939  elem = outer_id;
940  }
941 #endif
942  auto pos_ptr = slot_selector(pos_buff, elem);
943  const auto bin_idx = pos_ptr - pos_buff;
944  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
945  id_buff[id_buff_idx] = static_cast<int32_t>(index);
946  }
947 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1170 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

1182  {
1183  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
1184  return SUFFIX(get_hash_slot_sharded)(pos_buff,
1185  elem,
1186  type_info.min_val,
1187  shard_info.entry_count_per_shard,
1188  shard_info.num_shards,
1189  shard_info.device_count);
1190  };
1191  fill_row_ids_impl(buff,
1192  hash_entry_count,
1193  join_column,
1194  type_info
1195 #ifndef __CUDACC__
1196  ,
1197  sd_inner_to_outer_translation_map,
1198  min_inner_elem,
1199  cpu_thread_idx,
1200  cpu_thread_count
1201 #endif
1202  ,
1203  slot_sel);
1204 }
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
size_t num_shards
size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:108

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1206 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1220  {
1221  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1222  auto elem) {
1224  pos_buff,
1225  elem,
1226  type_info.min_val / bucket_normalization,
1227  type_info.translated_null_val,
1228  shard_info.entry_count_per_shard,
1229  shard_info.num_shards,
1230  shard_info.device_count,
1231  bucket_normalization);
1232  };
1233 
1234  fill_row_ids_impl(buff,
1235  hash_entry_count,
1236  join_column,
1237  type_info
1238 #ifndef __CUDACC__
1239  ,
1240  sd_inner_to_outer_translation_map,
1241  min_inner_elem,
1242  cpu_thread_idx,
1243  cpu_thread_count
1244 #endif
1245  ,
1246  slot_sel);
1247 }
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
size_t num_shards
size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:90

+ Here is the call graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 1112 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

1125  {
1126 
1127  int32_t* pos_buff = buff;
1128  int32_t* count_buff = buff + hash_entry_count;
1129  int32_t* id_buff = count_buff + hash_entry_count;
1130 
1131 #ifdef __CUDACC__
1132  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1133  int32_t step = blockDim.x * gridDim.x;
1134 #else
1135  int32_t start = cpu_thread_idx;
1136  int32_t step = cpu_thread_count;
1137 #endif
1138  JoinColumnTyped col{&join_column, &type_info};
1139  for (auto item : col.slice(start, step)) {
1140  const size_t index = item.index;
1141  int64_t elem = item.element;
1142  if (elem == type_info.null_val) {
1143  if (type_info.uses_bw_eq) {
1144  elem = type_info.translated_null_val;
1145  } else {
1146  continue;
1147  }
1148  }
1149 #ifndef __CUDACC__
1150  if (sd_inner_to_outer_translation_map &&
1151  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
1152  const auto outer_id = map_str_id_to_outer_dict(elem,
1153  min_inner_elem,
1154  type_info.min_val,
1155  type_info.max_val,
1156  sd_inner_to_outer_translation_map);
1157  if (outer_id == StringDictionary::INVALID_STR_ID) {
1158  continue;
1159  }
1160  elem = outer_id;
1161  }
1162 #endif
1163  auto* pos_ptr = slot_selector(pos_buff, elem);
1164  const auto bin_idx = pos_ptr - pos_buff;
1165  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1166  id_buff[id_buff_idx] = static_cast<int32_t>(index);
1167  }
1168 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 465 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), load_cst, store_cst, SUFFIX, heavydb.dtypes::T, and UNLIKELY.

Referenced by write_baseline_hash_slot(), and write_baseline_hash_slot_for_semi_join().

469  {
470  uint32_t off = h * hash_entry_size;
471  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
472  T empty_key = SUFFIX(get_invalid_key)<T>();
473  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
474  if (UNLIKELY(*key == write_pending)) {
475  // Address the singularity case where the first column contains the pending
476  // write special value. Should never happen, but avoid doing wrong things.
477  return nullptr;
478  }
479  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
480  if (success) {
481  if (key_component_count > 1) {
482  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
483  }
484  store_cst(row_ptr, *key);
485  return reinterpret_cast<T*>(row_ptr + key_component_count);
486  }
487  while (load_cst(row_ptr) == write_pending) {
488  // spin until the winning thread has finished writing the entire key
489  }
490  for (size_t i = 0; i < key_component_count; ++i) {
491  if (load_cst(row_ptr + i) != key[i]) {
492  return nullptr;
493  }
494  }
495  return reinterpret_cast<T*>(row_ptr + key_component_count);
496 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE NEVER_INLINE const T* SUFFIX() get_matching_baseline_hash_slot_readonly ( const T *  key,
const size_t  key_component_count,
const T *  composite_key_dict,
const int64_t  entry_count,
const size_t  key_size_in_bytes 
)

Definition at line 818 of file HashJoinRuntime.cpp.

References CHECK, keys_are_equal(), and MurmurHash1Impl().

Referenced by count_matches_baseline(), and fill_row_ids_baseline().

823  {
824  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
825  uint32_t off = h * key_component_count;
826  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
827  return &composite_key_dict[off];
828  }
829  uint32_t h_probe = (h + 1) % entry_count;
830  while (h_probe != h) {
831  off = h_probe * key_component_count;
832  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
833  return &composite_key_dict[off];
834  }
835  h_probe = (h_probe + 1) % entry_count;
836  }
837 #ifndef __CUDACC__
838  CHECK(false);
839 #else
840  assert(false);
841 #endif
842  return nullptr;
843 }
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename InputIterator , typename OutputIterator >
void inclusive_scan ( InputIterator  first,
InputIterator  last,
OutputIterator  out,
const size_t  thread_count 
)

Definition at line 1430 of file HashJoinRuntime.cpp.

References threading_serial::async(), and gpu_enabled::partial_sum().

Referenced by approximate_distinct_tuples_bbox_intersect(), approximate_distinct_tuples_on_device_bbox_intersect(), approximate_distinct_tuples_on_device_range(), approximate_distinct_tuples_range(), fill_one_to_many_baseline_hash_table(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_impl(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), fill_one_to_many_hash_table_sharded_impl(), and gpu_enabled::partial_sum().

1433  {
1434  using ElementType = typename InputIterator::value_type;
1435  using OffsetType = typename InputIterator::difference_type;
1436  const OffsetType elem_count = last - first;
1437  if (elem_count < 10000 || thread_count <= 1) {
1438  ElementType sum = 0;
1439  for (auto iter = first; iter != last; ++iter, ++out) {
1440  *out = sum += *iter;
1441  }
1442  return;
1443  }
1444 
1445  const OffsetType step = (elem_count + thread_count - 1) / thread_count;
1446  OffsetType start_off = 0;
1447  OffsetType end_off = std::min(step, elem_count);
1448  std::vector<ElementType> partial_sums(thread_count);
1449  std::vector<std::future<void>> counter_threads;
1450  for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,
1451  start_off = std::min(start_off + step, elem_count),
1452  end_off = std::min(start_off + step, elem_count)) {
1453  counter_threads.push_back(std::async(
1455  [first, out](
1456  ElementType& partial_sum, const OffsetType start, const OffsetType end) {
1457  ElementType sum = 0;
1458  for (auto in_iter = first + start, out_iter = out + start;
1459  in_iter != (first + end);
1460  ++in_iter, ++out_iter) {
1461  *out_iter = sum += *in_iter;
1462  }
1463  partial_sum = sum;
1464  },
1465  std::ref(partial_sums[thread_idx]),
1466  start_off,
1467  end_off));
1468  }
1469  for (auto& child : counter_threads) {
1470  child.get();
1471  }
1472 
1473  ElementType sum = 0;
1474  for (auto& s : partial_sums) {
1475  s += sum;
1476  sum = s;
1477  }
1478 
1479  counter_threads.clear();
1480  start_off = std::min(step, elem_count);
1481  end_off = std::min(start_off + step, elem_count);
1482  for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,
1483  start_off = std::min(start_off + step, elem_count),
1484  end_off = std::min(start_off + step, elem_count)) {
1485  counter_threads.push_back(std::async(
1487  [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {
1488  for (auto iter = out + start; iter != (out + end); ++iter) {
1489  *iter += prev_sum;
1490  }
1491  },
1492  partial_sums[thread_idx],
1493  start_off,
1494  end_off));
1495  }
1496  for (auto& child : counter_threads) {
1497  child.get();
1498  }
1499 }
future< Result > async(Fn &&fn, Args &&...args)
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 346 of file HashJoinRuntime.cpp.

References get_invalid_key(), SUFFIX, and heavydb.dtypes::T.

Referenced by init_baseline_hash_join_buff_wrapper().

352  {
353 #ifdef __CUDACC__
354  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
355  int32_t step = blockDim.x * gridDim.x;
356 #else
357  int32_t start = cpu_thread_idx;
358  int32_t step = cpu_thread_count;
359 #endif
360  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
361  const T empty_key = SUFFIX(get_invalid_key)<T>();
362  for (int64_t h = start; h < entry_count; h += step) {
363  int64_t off = h * hash_entry_size;
364  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
365  for (size_t i = 0; i < key_component_count; ++i) {
366  row_ptr[i] = empty_key;
367  }
368  if (with_val_slot) {
369  row_ptr[key_component_count] = invalid_slot_val;
370  }
371  }
372 }
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1787 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1793  {
1794  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1795  entry_count,
1796  key_component_count,
1797  with_val_slot,
1798  invalid_slot_val,
1799  cpu_thread_idx,
1800  cpu_thread_count);
1801 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1803 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1809  {
1810  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1811  entry_count,
1812  key_component_count,
1813  with_val_slot,
1814  invalid_slot_val,
1815  cpu_thread_idx,
1816  cpu_thread_count);
1817 }

+ Here is the caller graph for this function:

DEVICE void SUFFIX() init_hash_join_buff ( int32_t *  groups_buffer,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 70 of file HashJoinRuntime.cpp.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

74  {
75 #ifdef __CUDACC__
76  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
77  int32_t step = blockDim.x * gridDim.x;
78 #else
79  int32_t start = cpu_thread_idx;
80  int32_t step = cpu_thread_count;
81 #endif
82  for (int64_t i = start; i < hash_entry_count; i += step) {
83  groups_buffer[i] = invalid_slot_val;
84  }
85 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1887 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1895  {
1896  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1897  entry_count,
1898  invalid_slot_val,
1899  false,
1900  key_component_count,
1901  with_val_slot,
1902  key_handler,
1903  num_elems,
1904  cpu_thread_idx,
1905  cpu_thread_count);
1906 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1952 of file HashJoinRuntime.cpp.

1960  {
1961  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1962  entry_count,
1963  invalid_slot_val,
1964  false,
1965  key_component_count,
1966  with_val_slot,
1967  key_handler,
1968  num_elems,
1969  cpu_thread_idx,
1970  cpu_thread_count);
1971 }
template<typename T >
DEVICE int write_baseline_hash_slot ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 505 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and heavydb.dtypes::T.

513  {
514  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
515  T* matching_group = get_matching_baseline_hash_slot_at(
516  hash_buff, h, key, key_component_count, hash_entry_size);
517  if (!matching_group) {
518  uint32_t h_probe = (h + 1) % entry_count;
519  while (h_probe != h) {
520  matching_group = get_matching_baseline_hash_slot_at(
521  hash_buff, h_probe, key, key_component_count, hash_entry_size);
522  if (matching_group) {
523  break;
524  }
525  h_probe = (h_probe + 1) % entry_count;
526  }
527  }
528  if (!matching_group) {
529  return -2;
530  }
531  if (!with_val_slot) {
532  return 0;
533  }
534  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
535  return -1;
536  }
537  return 0;
538 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function:

template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 541 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and heavydb.dtypes::T.

549  {
550  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
551  T* matching_group = get_matching_baseline_hash_slot_at(
552  hash_buff, h, key, key_component_count, hash_entry_size);
553  if (!matching_group) {
554  uint32_t h_probe = (h + 1) % entry_count;
555  while (h_probe != h) {
556  matching_group = get_matching_baseline_hash_slot_at(
557  hash_buff, h_probe, key, key_component_count, hash_entry_size);
558  if (matching_group) {
559  break;
560  }
561  h_probe = (h_probe + 1) % entry_count;
562  }
563  }
564  if (!matching_group) {
565  return -2;
566  }
567  if (!with_val_slot) {
568  return 0;
569  }
570  mapd_cas(matching_group, invalid_slot_val, val);
571  return 0;
572 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function: