OmniSciDB  91042dcc5b
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntime.cpp File Reference
+ Include dependency graph for HashJoinRuntime.cpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::translate_str_id_to_outer_dict (const int64_t elem, const int64_t min_elem, const int64_t max_elem, const void *sd_inner_proxy, const void *sd_outer_proxy)
 
int64_t anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict (const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void SUFFIX() init_hash_join_buff_tbb (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T
*SUFFIX() 
get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const size_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 

Macro Definition Documentation

#define cas_cst (   ptr,
  expected,
  desired 
)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 466 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst (   ptr)    __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 470 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add (   address,
  val 
)    __sync_fetch_and_add(address, val)
#define mapd_cas (   address,
  compare,
  val 
)    __sync_val_compare_and_swap(address, compare, val)
#define store_cst (   ptr,
  val 
)    __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 469 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2179 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2184  {
2185  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2186  CHECK(!join_column_per_key.empty());
2187 
2188  std::vector<std::future<void>> approx_distinct_threads;
2189  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2190  approx_distinct_threads.push_back(std::async(
 2191                                  std::launch::async,
 2192                                  [&join_column_per_key,
2193  &type_info_per_key,
2194  b,
2195  hll_buffer_all_cpus,
2196  padded_size_bytes,
2197  thread_idx,
2198  thread_count] {
2199  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2200 
2201  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2202  false,
2203  &join_column_per_key[0],
2204  &type_info_per_key[0],
2205  nullptr,
2206  nullptr);
 2207                   approximate_distinct_tuples_impl(hll_buffer,
 2208                                                    nullptr,
2209  b,
2210  join_column_per_key[0].num_elems,
2211  &key_handler,
2212  thread_idx,
2213  thread_count);
2214  }));
2215  }
2216  for (auto& child : approx_distinct_threads) {
2217  child.get();
2218  }
2219 }
#define CHECK_EQ(x, y)
Definition: Logger.h:219
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:211

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1236 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), approximate_distinct_tuples_overlaps(), and approximate_distinct_tuples_range().

1246  {
1247 #ifdef __CUDACC__
1248  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1249  int32_t step = blockDim.x * gridDim.x;
1250 #else
1251  int32_t start = cpu_thread_idx;
1252  int32_t step = cpu_thread_count;
1253 #endif
1254 
1255  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1256  const int64_t entry_idx,
1257  const int64_t* key_scratch_buff,
1258  const size_t key_component_count) {
1259  if (row_count_buffer) {
1260  row_count_buffer[entry_idx] += 1;
1261  }
1262 
1263  const uint64_t hash =
1264  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1265  const uint32_t index = hash >> (64 - b);
1266  const auto rank = get_rank(hash << b, 64 - b);
1267 #ifdef __CUDACC__
1268  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1269 #else
1270  hll_buffer[index] = std::max(hll_buffer[index], rank);
1271 #endif
1272 
1273  return 0;
1274  };
1275 
1276  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1277 
1278  JoinColumnTuple cols(
1279  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1280  for (auto& it : cols.slice(start, step)) {
1281  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1282  }
1283 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
char * f
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2221 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2229  {
2230  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2231  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2232  CHECK(!join_column_per_key.empty());
2233 
2234  std::vector<std::future<void>> approx_distinct_threads;
2235  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2236  approx_distinct_threads.push_back(std::async(
 2237                                  std::launch::async,
 2238                                  [&join_column_per_key,
2239  &join_buckets_per_key,
2240  &row_counts,
2241  b,
2242  hll_buffer_all_cpus,
2243  padded_size_bytes,
2244  thread_idx,
2245  thread_count] {
2246  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2247 
2248  const auto key_handler = OverlapsKeyHandler(
2249  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2250  &join_column_per_key[0],
2251  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
 2252                   approximate_distinct_tuples_impl(hll_buffer,
 2253                                                    row_counts.data(),
2254  b,
2255  join_column_per_key[0].num_elems,
2256  &key_handler,
2257  thread_idx,
2258  thread_count);
2259  }));
2260  }
2261  for (auto& child : approx_distinct_threads) {
2262  child.get();
2263  }
2264 
 2265   inclusive_scan(
 2266       row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2267 }
#define CHECK_EQ(x, y)
Definition: Logger.h:219
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:211

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2269 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2278  {
2279  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2280  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2281  CHECK(!join_column_per_key.empty());
2282 
2283  std::vector<std::future<void>> approx_distinct_threads;
2284  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2285  approx_distinct_threads.push_back(std::async(
 2287                                  std::launch::async,
 2288                                  [&join_column_per_key,
2288  &join_buckets_per_key,
2289  &row_counts,
2290  b,
2291  hll_buffer_all_cpus,
2292  padded_size_bytes,
2293  thread_idx,
2294  is_compressed,
2295  thread_count] {
2296  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2297 
2298  const auto key_handler = RangeKeyHandler(
2299  is_compressed,
2300  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2301  &join_column_per_key[0],
2302  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
 2303                   approximate_distinct_tuples_impl(hll_buffer,
 2304                                                    row_counts.data(),
2305  b,
2306  join_column_per_key[0].num_elems,
2307  &key_handler,
2308  thread_idx,
2309  thread_count);
2310  }));
2311  }
2312  for (auto& child : approx_distinct_threads) {
2313  child.get();
2314  }
2315 
 2316   inclusive_scan(
 2317       row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2318 }
#define CHECK_EQ(x, y)
Definition: Logger.h:219
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:211

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1306 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), anonymous_namespace{Utm.h}::N, JoinColumnIterator::ptr(), and SUFFIX.

1315  {
1316 #ifdef __CUDACC__
1317  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1318  int32_t step = blockDim.x * gridDim.x;
1319 #else
1320  int32_t start = cpu_thread_idx;
1321  int32_t step = cpu_thread_count;
1322 #endif
1323  JoinColumnIterator it(join_column, type_info, start, step);
1324  for (; it; ++it) {
1325  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1326  double bounds[2 * N];
1327  for (size_t j = 0; j < 2 * N; j++) {
1328  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1329  }
1330 
1331  for (size_t j = 0; j < N; j++) {
1332  const auto diff = bounds[j + N] - bounds[j];
1333 #ifdef __CUDACC__
1334  if (diff > bucket_size_thresholds[j]) {
1335  atomicMin(&bucket_sizes_for_thread[j], diff);
1336  }
1337 #else
1338  if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {
1339  bucket_sizes_for_thread[j] = diff;
1340  }
1341 #endif
1342  }
1343  }
1344 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
constexpr unsigned N
Definition: Utm.h:110
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:134

+ Here is the call graph for this function:

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2320 of file HashJoinRuntime.cpp.

References threading_serial::async(), and i.

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2324  {
2325  std::vector<std::vector<double>> bucket_sizes_for_threads;
2326  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2327  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2328  }
2329  std::vector<std::future<void>> threads;
2330  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2331  threads.push_back(std::async(std::launch::async,
2332  compute_bucket_sizes_impl<2>,
2333  bucket_sizes_for_threads[thread_idx].data(),
2334  &join_column,
2335  &type_info,
2336  bucket_size_thresholds.data(),
2337  thread_idx,
2338  thread_count));
2339  }
2340  for (auto& child : threads) {
2341  child.get();
2342  }
2343 
2344  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2345  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2346  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2347  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2348  }
2349  }
2350  }
2351 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 711 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

722  {
723  auto slot_sel = [&type_info](auto count_buff, auto elem) {
724  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
725  };
726  count_matches_impl(count_buff,
727  invalid_slot_val,
728  join_column,
729  type_info
730 #ifndef __CUDACC__
731  ,
732  sd_inner_to_outer_translation_map,
733  min_inner_elem,
734  cpu_thread_idx,
735  cpu_thread_count
736 #endif
737  ,
738  slot_sel);
739 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:75
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 858 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

868  {
869 #ifdef __CUDACC__
870  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
871  int32_t step = blockDim.x * gridDim.x;
872 #else
873  int32_t start = cpu_thread_idx;
874  int32_t step = cpu_thread_count;
875 #endif
876 #ifdef __CUDACC__
877  assert(composite_key_dict);
878 #endif
879  T key_scratch_buff[g_maximum_conditions_to_coalesce];
880  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
881  auto key_buff_handler = [composite_key_dict,
882  entry_count,
883  count_buff,
884  key_size_in_bytes](const int64_t row_entry_idx,
885  const T* key_scratch_buff,
886  const size_t key_component_count) {
887  const auto matching_group =
888  SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,
889  key_component_count,
890  composite_key_dict,
891  entry_count,
892  key_size_in_bytes);
893  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
894  mapd_add(&count_buff[entry_idx], int32_t(1));
895  return 0;
896  };
897 
898  JoinColumnTuple cols(
899  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
900  for (auto& it : cols.slice(start, step)) {
901  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
902  }
903 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
char * f
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 741 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

754  {
755  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
756  return SUFFIX(get_bucketized_hash_slot)(
757  count_buff, elem, type_info.min_val, bucket_normalization);
758  };
759  count_matches_impl(count_buff,
760  invalid_slot_val,
761  join_column,
762  type_info
763 #ifndef __CUDACC__
764  ,
765  sd_inner_to_outer_translation_map,
766  min_inner_elem,
767  cpu_thread_idx,
768  cpu_thread_count
769 #endif
770  ,
771  slot_sel);
772 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:67
#define SUFFIX(name)
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 662 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), and mapd_add.

Referenced by count_matches(), and count_matches_bucketized().

674  {
675 #ifdef __CUDACC__
676  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
677  int32_t step = blockDim.x * gridDim.x;
678 #else
679  int32_t start = cpu_thread_idx;
680  int32_t step = cpu_thread_count;
681 #endif
682  JoinColumnTyped col{&join_column, &type_info};
683  for (auto item : col.slice(start, step)) {
684  int64_t elem = item.element;
685  if (elem == type_info.null_val) {
686  if (type_info.uses_bw_eq) {
687  elem = type_info.translated_null_val;
688  } else {
689  continue;
690  }
691  }
692 #ifndef __CUDACC__
693  if (sd_inner_to_outer_translation_map &&
694  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
695  const auto outer_id = map_str_id_to_outer_dict(elem,
696  min_inner_elem,
697  type_info.min_val,
698  type_info.max_val,
699  sd_inner_to_outer_translation_map);
700  if (outer_id == StringDictionary::INVALID_STR_ID) {
701  continue;
702  }
703  elem = outer_id;
704  }
705 #endif
706  auto* entry_ptr = slot_selector(count_buff, elem);
707  mapd_add(entry_ptr, int32_t(1));
708  }
709 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 774 of file HashJoinRuntime.cpp.

References get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

787  {
788 #ifdef __CUDACC__
789  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
790  int32_t step = blockDim.x * gridDim.x;
791 #else
792  int32_t start = cpu_thread_idx;
793  int32_t step = cpu_thread_count;
794 #endif
795  JoinColumnTyped col{&join_column, &type_info};
796  for (auto item : col.slice(start, step)) {
797  int64_t elem = item.element;
798  if (elem == type_info.null_val) {
799  if (type_info.uses_bw_eq) {
800  elem = type_info.translated_null_val;
801  } else {
802  continue;
803  }
804  }
805 #ifndef __CUDACC__
806  if (sd_inner_to_outer_translation_map &&
807  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
808  const auto outer_id = map_str_id_to_outer_dict(elem,
809  min_inner_elem,
810  type_info.min_val,
811  type_info.max_val,
812  sd_inner_to_outer_translation_map);
813  if (outer_id == StringDictionary::INVALID_STR_ID) {
814  continue;
815  }
816  elem = outer_id;
817  }
818 #endif
819  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
820  elem,
821  type_info.min_val,
822  shard_info.entry_count_per_shard,
823  shard_info.num_shards,
824  shard_info.device_count);
825  mapd_add(entry_ptr, int32_t(1));
826  }
827 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const int64_t null_val
const size_t entry_count_per_shard
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:96
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 584 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, and omnisci.dtypes::T.

593  {
594 #ifdef __CUDACC__
595  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
596  int32_t step = blockDim.x * gridDim.x;
597 #else
598  int32_t start = cpu_thread_idx;
599  int32_t step = cpu_thread_count;
600 #endif
601 
602  T key_scratch_buff[g_maximum_conditions_to_coalesce];
603  const size_t key_size_in_bytes = key_component_count * sizeof(T);
604  const size_t hash_entry_size =
605  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
606  auto key_buff_handler = [hash_buff,
607  entry_count,
608  with_val_slot,
609  invalid_slot_val,
610  key_size_in_bytes,
611  hash_entry_size,
612  &for_semi_join](const int64_t entry_idx,
613  const T* key_scratch_buffer,
614  const size_t key_component_count) {
615  if (for_semi_join) {
616  return write_baseline_hash_slot_for_semi_join<T>(entry_idx,
617  hash_buff,
618  entry_count,
619  key_scratch_buffer,
620  key_component_count,
621  with_val_slot,
622  invalid_slot_val,
623  key_size_in_bytes,
624  hash_entry_size);
625  } else {
626  return write_baseline_hash_slot<T>(entry_idx,
627  hash_buff,
628  entry_count,
629  key_scratch_buffer,
630  key_component_count,
631  with_val_slot,
632  invalid_slot_val,
633  key_size_in_bytes,
634  hash_entry_size);
635  }
636  };
637 
638  JoinColumnTuple cols(
639  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
640  for (auto& it : cols.slice(start, step)) {
641  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
642  if (err) {
643  return err;
644  }
645  }
646  return 0;
647 }
char * f
const size_t g_maximum_conditions_to_coalesce
int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1778 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1787  {
1788  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1789  entry_count,
1790  invalid_slot_val,
1791  for_semi_join,
1792  key_component_count,
1793  with_val_slot,
1794  key_handler,
1795  num_elems,
1796  cpu_thread_idx,
1797  cpu_thread_count);
1798 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1842 of file HashJoinRuntime.cpp.

1851  {
1852  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1853  entry_count,
1854  invalid_slot_val,
1855  for_semi_join,
1856  key_component_count,
1857  with_val_slot,
1858  key_handler,
1859  num_elems,
1860  cpu_thread_idx,
1861  cpu_thread_count);
1862 }
DEVICE int SUFFIX() fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 230 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

238  {
239  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
240  : SUFFIX(fill_one_to_one_hashtable);
241  auto hashtable_filling_func = [&](auto elem, size_t index) {
242  auto entry_ptr = SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
243  return filling_func(index, entry_ptr, invalid_slot_val);
244  };
245 
246  return fill_hash_join_buff_impl(buff,
247  invalid_slot_val,
248  join_column,
249  type_info,
250  sd_inner_to_outer_translation_map,
251  min_inner_elem,
252  cpu_thread_idx,
253  cpu_thread_count,
254  hashtable_filling_func);
255 }
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:75
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 200 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

210  {
211  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
212  : SUFFIX(fill_one_to_one_hashtable);
213  auto hashtable_filling_func = [&](auto elem, size_t index) {
214  auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(
215  buff, elem, type_info.min_val, bucket_normalization);
216  return filling_func(index, entry_ptr, invalid_slot_val);
217  };
218 
219  return fill_hash_join_buff_impl(buff,
220  invalid_slot_val,
221  join_column,
222  type_info,
223  sd_inner_to_outer_translation_map,
224  min_inner_elem,
225  cpu_thread_idx,
226  cpu_thread_count,
227  hashtable_filling_func);
228 }
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:67
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 152 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff(), and fill_hash_join_buff_bucketized().

160  {
161 #ifdef __CUDACC__
162  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
163  int32_t step = blockDim.x * gridDim.x;
164 #else
165  int32_t start = cpu_thread_idx;
166  int32_t step = cpu_thread_count;
167 #endif
168  JoinColumnTyped col{&join_column, &type_info};
169  for (auto item : col.slice(start, step)) {
170  const size_t index = item.index;
171  int64_t elem = item.element;
172  if (elem == type_info.null_val) {
173  if (type_info.uses_bw_eq) {
174  elem = type_info.translated_null_val;
175  } else {
176  continue;
177  }
178  }
179 #ifndef __CUDACC__
180  if (sd_inner_to_outer_translation_map &&
181  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
182  const auto outer_id = map_str_id_to_outer_dict(elem,
183  min_inner_elem,
184  type_info.min_val,
185  type_info.max_val,
186  sd_inner_to_outer_translation_map);
187  if (outer_id == StringDictionary::INVALID_STR_ID) {
188  continue;
189  }
190  elem = outer_id;
191  }
192 #endif
193  if (filling_func(elem, index)) {
194  return -1;
195  }
196  }
197  return 0;
198 };
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 351 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

361  {
362  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
363  : SUFFIX(fill_one_to_one_hashtable);
364  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
365  auto entry_ptr = SUFFIX(get_hash_slot_sharded_opt)(buff,
366  elem,
367  type_info.min_val,
368  shard_info.entry_count_per_shard,
369  shard,
370  shard_info.num_shards,
371  shard_info.device_count);
372  return filling_func(index, entry_ptr, invalid_slot_val);
373  };
374 
375  return fill_hash_join_buff_sharded_impl(buff,
376  invalid_slot_val,
377  join_column,
378  type_info,
379  shard_info,
380  sd_inner_to_outer_translation_map,
381  min_inner_elem,
382  cpu_thread_idx,
383  cpu_thread_count,
384  hashtable_filling_func);
385 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:125
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 312 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

323  {
324  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
325  : SUFFIX(fill_one_to_one_hashtable);
326  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
327  auto entry_ptr =
328  SUFFIX(get_bucketized_hash_slot_sharded_opt)(buff,
329  elem,
330  type_info.min_val,
331  shard_info.entry_count_per_shard,
332  shard,
333  shard_info.num_shards,
334  shard_info.device_count,
335  bucket_normalization);
336  return filling_func(index, entry_ptr, invalid_slot_val);
337  };
338 
339  return fill_hash_join_buff_sharded_impl(buff,
340  invalid_slot_val,
341  join_column,
342  type_info,
343  shard_info,
344  sd_inner_to_outer_translation_map,
345  min_inner_elem,
346  cpu_thread_idx,
347  cpu_thread_count,
348  hashtable_filling_func);
349 }
const size_t num_shards
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:110
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
const size_t entry_count_per_shard
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 258 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

268  {
269 #ifdef __CUDACC__
270  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
271  int32_t step = blockDim.x * gridDim.x;
272 #else
273  int32_t start = cpu_thread_idx;
274  int32_t step = cpu_thread_count;
275 #endif
276  JoinColumnTyped col{&join_column, &type_info};
277  for (auto item : col.slice(start, step)) {
278  const size_t index = item.index;
279  int64_t elem = item.element;
280  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
281  if (shard != shard_info.shard) {
282  continue;
283  }
284  if (elem == type_info.null_val) {
285  if (type_info.uses_bw_eq) {
286  elem = type_info.translated_null_val;
287  } else {
288  continue;
289  }
290  }
291 #ifndef __CUDACC__
292  if (sd_inner_to_outer_translation_map &&
293  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
294  const auto outer_id = map_str_id_to_outer_dict(elem,
295  min_inner_elem,
296  type_info.min_val,
297  type_info.max_val,
298  sd_inner_to_outer_translation_map);
299  if (outer_id == StringDictionary::INVALID_STR_ID) {
300  continue;
301  }
302  elem = outer_id;
303  }
304 #endif
305  if (filling_func(elem, shard, index)) {
306  return -1;
307  }
308  }
309  return 0;
310 }
const size_t num_shards
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const size_t shard
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const size_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 1907 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), i, inclusive_scan(), and SUFFIX.

1920  {
1921  int32_t* pos_buff = buff;
1922  int32_t* count_buff = buff + hash_entry_count;
1923  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1924  std::vector<std::future<void>> counter_threads;
1925  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1926  if (is_range_join) {
1927  counter_threads.push_back(std::async(
1928  std::launch::async,
1929  [count_buff,
1930  composite_key_dict,
1931  &hash_entry_count,
1932  &join_buckets_per_key,
1933  &join_column_per_key,
1934  &is_geo_compressed,
1935  cpu_thread_idx,
1936  cpu_thread_count] {
1937  const auto key_handler = RangeKeyHandler(
1938  is_geo_compressed,
1939  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1940  &join_column_per_key[0],
1941  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1942  count_matches_baseline(count_buff,
1943  composite_key_dict,
1944  hash_entry_count,
1945  &key_handler,
1946  join_column_per_key[0].num_elems,
1947  cpu_thread_idx,
1948  cpu_thread_count);
1949  }));
1950  } else if (join_buckets_per_key.size() > 0) {
1951  counter_threads.push_back(std::async(
1952  std::launch::async,
1953  [count_buff,
1954  composite_key_dict,
1955  &hash_entry_count,
1956  &join_buckets_per_key,
1957  &join_column_per_key,
1958  cpu_thread_idx,
1959  cpu_thread_count] {
1960  const auto key_handler = OverlapsKeyHandler(
1961  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1962  &join_column_per_key[0],
1963  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1964  count_matches_baseline(count_buff,
1965  composite_key_dict,
1966  hash_entry_count,
1967  &key_handler,
1968  join_column_per_key[0].num_elems,
1969  cpu_thread_idx,
1970  cpu_thread_count);
1971  }));
1972  } else {
1973  counter_threads.push_back(std::async(
1974  std::launch::async,
1975  [count_buff,
1976  composite_key_dict,
1977  &key_component_count,
1978  &hash_entry_count,
1979  &join_column_per_key,
1980  &type_info_per_key,
1981  &sd_inner_proxy_per_key,
1982  &sd_outer_proxy_per_key,
1983  cpu_thread_idx,
1984  cpu_thread_count] {
1985  const auto key_handler = GenericKeyHandler(key_component_count,
1986  true,
1987  &join_column_per_key[0],
1988  &type_info_per_key[0],
1989  &sd_inner_proxy_per_key[0],
1990  &sd_outer_proxy_per_key[0]);
1991  count_matches_baseline(count_buff,
1992  composite_key_dict,
1993  hash_entry_count,
1994  &key_handler,
1995  join_column_per_key[0].num_elems,
1996  cpu_thread_idx,
1997  cpu_thread_count);
1998  }));
1999  }
2000  }
2001 
2002  for (auto& child : counter_threads) {
2003  child.get();
2004  }
2005 
2006  std::vector<int32_t> count_copy(hash_entry_count, 0);
2007  CHECK_GT(hash_entry_count, int64_t(0));
2008  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
2009  inclusive_scan(
2010  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
2011  std::vector<std::future<void>> pos_threads;
2012  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
2013  pos_threads.push_back(std::async(
2014  std::launch::async,
2015  [&](const int thread_idx) {
2016  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
2017  if (count_buff[i]) {
2018  pos_buff[i] = count_copy[i];
2019  }
2020  }
2021  },
2022  cpu_thread_idx));
2023  }
2024  for (auto& child : pos_threads) {
2025  child.get();
2026  }
2027 
2028  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
2029  std::vector<std::future<void>> rowid_threads;
2030  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
2031  if (is_range_join) {
2032  rowid_threads.push_back(std::async(
2033  std::launch::async,
2034  [buff,
2035  composite_key_dict,
2036  hash_entry_count,
2037  invalid_slot_val,
2038  &join_column_per_key,
2039  &join_buckets_per_key,
2040  &is_geo_compressed,
2041  cpu_thread_idx,
2042  cpu_thread_count] {
2043  const auto key_handler = RangeKeyHandler(
2044  is_geo_compressed,
2045  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2046  &join_column_per_key[0],
2047  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2048  SUFFIX(fill_row_ids_baseline)
2049  (buff,
2050  composite_key_dict,
2051  hash_entry_count,
2052  invalid_slot_val,
2053  &key_handler,
2054  join_column_per_key[0].num_elems,
2055  cpu_thread_idx,
2056  cpu_thread_count);
2057  }));
2058  } else if (join_buckets_per_key.size() > 0) {
2059  rowid_threads.push_back(std::async(
2060  std::launch::async,
2061  [buff,
2062  composite_key_dict,
2063  hash_entry_count,
2064  invalid_slot_val,
2065  &join_column_per_key,
2066  &join_buckets_per_key,
2067  cpu_thread_idx,
2068  cpu_thread_count] {
2069  const auto key_handler = OverlapsKeyHandler(
2070  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2071  &join_column_per_key[0],
2072  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2073  SUFFIX(fill_row_ids_baseline)
2074  (buff,
2075  composite_key_dict,
2076  hash_entry_count,
2077  invalid_slot_val,
2078  &key_handler,
2079  join_column_per_key[0].num_elems,
2080  cpu_thread_idx,
2081  cpu_thread_count);
2082  }));
2083  } else {
2084  rowid_threads.push_back(std::async(std::launch::async,
2085  [buff,
2086  composite_key_dict,
2087  hash_entry_count,
2088  invalid_slot_val,
2089  key_component_count,
2090  &join_column_per_key,
2091  &type_info_per_key,
2092  &sd_inner_proxy_per_key,
2093  &sd_outer_proxy_per_key,
2094  cpu_thread_idx,
2095  cpu_thread_count] {
2096  const auto key_handler = GenericKeyHandler(
2097  key_component_count,
2098  true,
2099  &join_column_per_key[0],
2100  &type_info_per_key[0],
2101  &sd_inner_proxy_per_key[0],
2102  &sd_outer_proxy_per_key[0]);
2103  SUFFIX(fill_row_ids_baseline)
2104  (buff,
2105  composite_key_dict,
2106  hash_entry_count,
2107  invalid_slot_val,
2108  &key_handler,
2109  join_column_per_key[0].num_elems,
2110  cpu_thread_idx,
2111  cpu_thread_count);
2112  }));
2113  }
2114  }
2115 
2116  for (auto& child : rowid_threads) {
2117  child.get();
2118  }
2119 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:223
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 2121 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2134  {
2135  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2136  composite_key_dict,
2137  hash_entry_count,
2138  invalid_slot_val,
2139  key_component_count,
2140  join_column_per_key,
2141  type_info_per_key,
2142  join_bucket_info,
2143  sd_inner_proxy_per_key,
2144  sd_outer_proxy_per_key,
2145  cpu_thread_count,
2146  is_range_join,
2147  is_geo_compressed);
2148 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 2150 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2163  {
2164  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2165  composite_key_dict,
2166  hash_entry_count,
2167  invalid_slot_val,
2168  key_component_count,
2169  join_column_per_key,
2170  type_info_per_key,
2171  join_bucket_info,
2172  sd_inner_proxy_per_key,
2173  sd_outer_proxy_per_key,
2174  cpu_thread_count,
2175  is_range_join,
2176  is_geo_compressed);
2177 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1483 of file HashJoinRuntime.cpp.

References count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1490  {
1491  auto timer = DEBUG_TIMER(__func__);
1492  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1493  invalid_slot_val,
1494  &join_column,
1495  &type_info,
1496  sd_inner_to_outer_translation_map,
1497  min_inner_elem](auto cpu_thread_idx,
1498  auto cpu_thread_count) {
1500  (count_buff,
1501  invalid_slot_val,
1502  join_column,
1503  type_info,
1504  sd_inner_to_outer_translation_map,
1505  min_inner_elem,
1506  cpu_thread_idx,
1507  cpu_thread_count);
1508  };
1509  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1510  buff,
1511  invalid_slot_val,
1512  &join_column,
1513  &type_info,
1514  sd_inner_to_outer_translation_map,
1515  min_inner_elem](auto cpu_thread_idx,
1516  auto cpu_thread_count) {
1518  (buff,
1519  hash_entry_count,
1520  invalid_slot_val,
1521  join_column,
1522  type_info,
1523  sd_inner_to_outer_translation_map,
1524  min_inner_elem,
1525  cpu_thread_idx,
1526  cpu_thread_count);
1527  };
1528 
1530  hash_entry_info.hash_entry_count,
1531  invalid_slot_val,
1532  join_column,
1533  type_info,
1534  sd_inner_to_outer_translation_map,
1535  min_inner_elem,
1536  cpu_thread_count,
1537  launch_count_matches,
1538  launch_fill_row_ids);
1539 }
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t hash_entry_count
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define DEBUG_TIMER(name)
Definition: Logger.h:358

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1541 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1549  {
1550  auto timer = DEBUG_TIMER(__func__);
1551  auto bucket_normalization = hash_entry_info.bucket_normalization;
1552  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1553  auto launch_count_matches = [bucket_normalization,
1554  count_buff = buff + hash_entry_count,
1555  invalid_slot_val,
1556  &join_column,
1557  &type_info,
1558  sd_inner_to_outer_translation_map,
1559  min_inner_elem](auto cpu_thread_idx,
1560  auto cpu_thread_count) {
1562  (count_buff,
1563  invalid_slot_val,
1564  join_column,
1565  type_info,
1566  sd_inner_to_outer_translation_map,
1567  min_inner_elem,
1568  cpu_thread_idx,
1569  cpu_thread_count,
1570  bucket_normalization);
1571  };
1572  auto launch_fill_row_ids = [bucket_normalization,
1573  hash_entry_count,
1574  buff,
1575  invalid_slot_val,
1576  &join_column,
1577  &type_info,
1578  sd_inner_to_outer_translation_map,
1579  min_inner_elem](auto cpu_thread_idx,
1580  auto cpu_thread_count) {
1582  (buff,
1583  hash_entry_count,
1584  invalid_slot_val,
1585  join_column,
1586  type_info,
1587  sd_inner_to_outer_translation_map,
1588  min_inner_elem,
1589  cpu_thread_idx,
1590  cpu_thread_count,
1591  bucket_normalization);
1592  };
1593 
1595  hash_entry_count,
1596  invalid_slot_val,
1597  join_column,
1598  type_info,
1599  sd_inner_to_outer_translation_map,
1600  min_inner_elem,
1601  cpu_thread_count,
1602  launch_count_matches,
1603  launch_fill_row_ids);
1604 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
#define DEBUG_TIMER(name)
Definition: Logger.h:358

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1421 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1430  {
1431  auto timer = DEBUG_TIMER(__func__);
1432  int32_t* pos_buff = buff;
1433  int32_t* count_buff = buff + hash_entry_count;
1434  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1435  std::vector<std::future<void>> counter_threads;
1436  for (unsigned cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1437  counter_threads.push_back(std::async(
1438  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1439  }
1440 
1441  for (auto& child : counter_threads) {
1442  child.get();
1443  }
1444 
1445  std::vector<int32_t> count_copy(hash_entry_count, 0);
1446  CHECK_GT(hash_entry_count, int64_t(0));
1447  memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1448 #if HAVE_CUDA
1449  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1450 #else
1452  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1453 #endif
1454  std::vector<std::future<void>> pos_threads;
1455  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1456  pos_threads.push_back(std::async(
1458  [&](size_t thread_idx) {
1459  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1460  if (count_buff[i]) {
1461  pos_buff[i] = count_copy[i];
1462  }
1463  }
1464  },
1465  cpu_thread_idx));
1466  }
1467  for (auto& child : pos_threads) {
1468  child.get();
1469  }
1470 
1471  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1472  std::vector<std::future<void>> rowid_threads;
1473  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1474  rowid_threads.push_back(std::async(
1475  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1476  }
1477 
1478  for (auto& child : rowid_threads) {
1479  child.get();
1480  }
1481 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:223
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:358

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1667 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1675  {
1676  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1677  invalid_slot_val,
1678  &join_column,
1679  &type_info,
1680  &shard_info
1681 #ifndef __CUDACC__
1682  ,
1683  sd_inner_to_outer_translation_map,
1684  min_inner_elem
1685 #endif
1686  ](auto cpu_thread_idx, auto cpu_thread_count) {
1687  return SUFFIX(count_matches_sharded)(count_buff,
1688  invalid_slot_val,
1689  join_column,
1690  type_info,
1691  shard_info
1692 #ifndef __CUDACC__
1693  ,
1694  sd_inner_to_outer_translation_map,
1695  min_inner_elem,
1696  cpu_thread_idx,
1697  cpu_thread_count
1698 #endif
1699  );
1700  };
1701 
1702  auto launch_fill_row_ids = [buff,
1703  hash_entry_count,
1704  invalid_slot_val,
1705  &join_column,
1706  &type_info,
1707  &shard_info
1708 #ifndef __CUDACC__
1709  ,
1710  sd_inner_to_outer_translation_map,
1711  min_inner_elem
1712 #endif
1713  ](auto cpu_thread_idx, auto cpu_thread_count) {
1714  return SUFFIX(fill_row_ids_sharded)(buff,
1715  hash_entry_count,
1716  invalid_slot_val,
1717  join_column,
1718  type_info,
1719  shard_info
1720 #ifndef __CUDACC__
1721  ,
1722  sd_inner_to_outer_translation_map,
1723  min_inner_elem,
1724  cpu_thread_idx,
1725  cpu_thread_count);
1726 #endif
1727  };
1728 
1730  hash_entry_count,
1731  invalid_slot_val,
1732  join_column,
1733  type_info,
1734  shard_info
1735 #ifndef __CUDACC__
1736  ,
1737  sd_inner_to_outer_translation_map,
1738  min_inner_elem,
1739  cpu_thread_count
1740 #endif
1741  ,
1742  launch_count_matches,
1743  launch_fill_row_ids);
1744 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1607 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, i, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1618  {
1619  auto timer = DEBUG_TIMER(__func__);
1620  int32_t* pos_buff = buff;
1621  int32_t* count_buff = buff + hash_entry_count;
1622  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1623  std::vector<std::future<void>> counter_threads;
1624  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1625  counter_threads.push_back(std::async(
1626  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1627  }
1628 
1629  for (auto& child : counter_threads) {
1630  child.get();
1631  }
1632 
1633  std::vector<int32_t> count_copy(hash_entry_count, 0);
1634  CHECK_GT(hash_entry_count, int64_t(0));
1635  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1637  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1638  std::vector<std::future<void>> pos_threads;
1639  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1640  pos_threads.push_back(std::async(
1642  [&](const unsigned thread_idx) {
1643  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1644  if (count_buff[i]) {
1645  pos_buff[i] = count_copy[i];
1646  }
1647  }
1648  },
1649  cpu_thread_idx));
1650  }
1651  for (auto& child : pos_threads) {
1652  child.get();
1653  }
1654 
1655  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1656  std::vector<std::future<void>> rowid_threads;
1657  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1658  rowid_threads.push_back(std::async(
1659  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1660  }
1661 
1662  for (auto& child : rowid_threads) {
1663  child.get();
1664  }
1665 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:223
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:358

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 963 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

975  {
976  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
977  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
978  };
979 
980  fill_row_ids_impl(buff,
981  hash_entry_count,
982  invalid_slot_val,
983  join_column,
984  type_info
985 #ifndef __CUDACC__
986  ,
987  sd_inner_to_outer_translation_map,
988  min_inner_elem,
989  cpu_thread_idx,
990  cpu_thread_count
991 #endif
992  ,
993  slot_sel);
994 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:75
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1174 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and omnisci.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

1185  {
1186  int32_t* pos_buff = buff;
1187  int32_t* count_buff = buff + hash_entry_count;
1188  int32_t* id_buff = count_buff + hash_entry_count;
1189 #ifdef __CUDACC__
1190  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1191  int32_t step = blockDim.x * gridDim.x;
1192 #else
1193  int32_t start = cpu_thread_idx;
1194  int32_t step = cpu_thread_count;
1195 #endif
1196 
1197  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1198 #ifdef __CUDACC__
1199  assert(composite_key_dict);
1200 #endif
1201  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1202  auto key_buff_handler = [composite_key_dict,
1203  hash_entry_count,
1204  pos_buff,
1205  invalid_slot_val,
1206  count_buff,
1207  id_buff,
1208  key_size_in_bytes](const int64_t row_index,
1209  const T* key_scratch_buff,
1210  const size_t key_component_count) {
1211  const T* matching_group =
1213  key_component_count,
1214  composite_key_dict,
1215  hash_entry_count,
1216  key_size_in_bytes);
1217  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1218  int32_t* pos_ptr = pos_buff + entry_idx;
1219  const auto bin_idx = pos_ptr - pos_buff;
1220  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1221  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1222  return 0;
1223  };
1224 
1225  JoinColumnTuple cols(
1226  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1227  for (auto& it : cols.slice(start, step)) {
1228  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1229  }
1230  return;
1231 }
#define SUFFIX(name)
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
char * f
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 996 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

1010  {
1011  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
1013  pos_buff, elem, type_info.min_val, bucket_normalization);
1014  };
1015  fill_row_ids_impl(buff,
1016  hash_entry_count,
1017  invalid_slot_val,
1018  join_column,
1019  type_info
1020 #ifndef __CUDACC__
1021  ,
1022  sd_inner_to_outer_translation_map,
1023  min_inner_elem,
1024  cpu_thread_idx,
1025  cpu_thread_count
1026 #endif
1027  ,
1028  slot_sel);
1029 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:67
#define SUFFIX(name)
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 906 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), and mapd_add.

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

919  {
920  int32_t* pos_buff = buff;
921  int32_t* count_buff = buff + hash_entry_count;
922  int32_t* id_buff = count_buff + hash_entry_count;
923 
924 #ifdef __CUDACC__
925  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
926  int32_t step = blockDim.x * gridDim.x;
927 #else
928  int32_t start = cpu_thread_idx;
929  int32_t step = cpu_thread_count;
930 #endif
931  JoinColumnTyped col{&join_column, &type_info};
932  for (auto item : col.slice(start, step)) {
933  const size_t index = item.index;
934  int64_t elem = item.element;
935  if (elem == type_info.null_val) {
936  if (type_info.uses_bw_eq) {
937  elem = type_info.translated_null_val;
938  } else {
939  continue;
940  }
941  }
942 #ifndef __CUDACC__
943  if (sd_inner_to_outer_translation_map &&
944  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
945  const auto outer_id = map_str_id_to_outer_dict(elem,
946  min_inner_elem,
947  type_info.min_val,
948  type_info.max_val,
949  sd_inner_to_outer_translation_map);
950  if (outer_id == StringDictionary::INVALID_STR_ID) {
951  continue;
952  }
953  elem = outer_id;
954  }
955 #endif
956  auto pos_ptr = slot_selector(pos_buff, elem);
957  const auto bin_idx = pos_ptr - pos_buff;
958  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
959  id_buff[id_buff_idx] = static_cast<int32_t>(index);
960  }
961 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1091 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

1104  {
1105  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
1106  return SUFFIX(get_hash_slot_sharded)(pos_buff,
1107  elem,
1108  type_info.min_val,
1109  shard_info.entry_count_per_shard,
1110  shard_info.num_shards,
1111  shard_info.device_count);
1112  };
1113 
1114  fill_row_ids_impl(buff,
1115  hash_entry_count,
1116  invalid_slot_val,
1117  join_column,
1118  type_info
1119 #ifndef __CUDACC__
1120  ,
1121  sd_inner_to_outer_translation_map,
1122  min_inner_elem,
1123  cpu_thread_idx,
1124  cpu_thread_count
1125 #endif
1126  ,
1127  slot_sel);
1128 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:96

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1130 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1145  {
1146  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1147  auto elem) {
1148  return SUFFIX(get_bucketized_hash_slot_sharded)(pos_buff,
1149  elem,
1150  type_info.min_val,
1151  shard_info.entry_count_per_shard,
1152  shard_info.num_shards,
1153  shard_info.device_count,
1154  bucket_normalization);
1155  };
1156 
1157  fill_row_ids_impl(buff,
1158  hash_entry_count,
1159  invalid_slot_val,
1160  join_column,
1161  type_info
1162 #ifndef __CUDACC__
1163  ,
1164  sd_inner_to_outer_translation_map,
1165  min_inner_elem,
1166  cpu_thread_idx,
1167  cpu_thread_count
1168 #endif
1169  ,
1170  slot_sel);
1171 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:81
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

// Fills the row-id section of a sharded one-to-many perfect hash table.
//
// `buff` holds three consecutive int32 arrays, each `hash_entry_count` long:
//   [0, hec)        pos:   start offset of each bin's id list (already built)
//   [hec, 2*hec)    count: per-bin fill cursor, advanced atomically here
//   [2*hec, 3*hec)  id:    row indices, grouped by bin
//
// Each thread walks a strided slice of the join column; on GPU the stride is
// the full grid, on CPU it is cpu_thread_count. `slot_selector` maps an
// element value to its position slot (shard-aware in the callers).
// NOTE: `invalid_slot_val` is accepted for signature parity with sibling
// fill functions but is not read in this body.
template <typename SLOT_SELECTOR>
DEVICE void fill_row_ids_sharded_impl(int32_t* buff,
                                      const int64_t hash_entry_count,
                                      const int32_t invalid_slot_val,
                                      const JoinColumn join_column,
                                      const JoinColumnTypeInfo type_info,
                                      const ShardInfo shard_info,
#ifndef __CUDACC__
                                      const int32_t* sd_inner_to_outer_translation_map,
                                      const int32_t min_inner_elem,
                                      const int32_t cpu_thread_idx,
                                      const int32_t cpu_thread_count,
#endif
                                      SLOT_SELECTOR slot_selector) {
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  int32_t* id_buff = count_buff + hash_entry_count;

#ifdef __CUDACC__
  // Device: one strided slice per hardware thread across the whole grid.
  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  int32_t step = blockDim.x * gridDim.x;
#else
  // Host: one strided slice per worker thread.
  int32_t start = cpu_thread_idx;
  int32_t step = cpu_thread_count;
#endif
  JoinColumnTyped col{&join_column, &type_info};
  for (auto item : col.slice(start, step)) {
    const size_t index = item.index;
    int64_t elem = item.element;
    if (elem == type_info.null_val) {
      if (type_info.uses_bw_eq) {
        // NULL-aware (bw) equality keeps NULLs, remapped to a sentinel value.
        elem = type_info.translated_null_val;
      } else {
        // Plain equality: NULLs never match, so skip the row entirely.
        continue;
      }
    }
#ifndef __CUDACC__
    // CPU-only: translate inner-dictionary string ids to the outer dictionary;
    // ids with no counterpart in the outer dictionary cannot match any probe
    // row and are dropped. The translated NULL sentinel is left untouched.
    if (sd_inner_to_outer_translation_map &&
        (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
      const auto outer_id = map_str_id_to_outer_dict(elem,
                                                     min_inner_elem,
                                                     type_info.min_val,
                                                     type_info.max_val,
                                                     sd_inner_to_outer_translation_map);
      if (outer_id == StringDictionary::INVALID_STR_ID) {
        continue;
      }
      elem = outer_id;
    }
#endif
    auto* pos_ptr = slot_selector(pos_buff, elem);
    const auto bin_idx = pos_ptr - pos_buff;
    // Atomically claim the next free slot in this bin's id list: the bin's
    // base offset (*pos_ptr) plus the pre-increment value of its counter.
    const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
    id_buff[id_buff_idx] = static_cast<int32_t>(index);
  }
}
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 474 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), i, load_cst, store_cst, SUFFIX, omnisci.dtypes::T, and UNLIKELY.

Referenced by write_baseline_hash_slot(), and write_baseline_hash_slot_for_semi_join().

478  {
479  uint32_t off = h * hash_entry_size;
480  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
481  T empty_key = SUFFIX(get_invalid_key)<T>();
482  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
483  if (UNLIKELY(*key == write_pending)) {
484  // Address the singularity case where the first column contains the pending
485  // write special value. Should never happen, but avoid doing wrong things.
486  return nullptr;
487  }
488  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
489  if (success) {
490  if (key_component_count > 1) {
491  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
492  }
493  store_cst(row_ptr, *key);
494  return reinterpret_cast<T*>(row_ptr + key_component_count);
495  }
496  while (load_cst(row_ptr) == write_pending) {
497  // spin until the winning thread has finished writing the entire key
498  }
499  for (size_t i = 0; i < key_component_count; ++i) {
500  if (load_cst(row_ptr + i) != key[i]) {
501  return nullptr;
502  }
503  }
504  return reinterpret_cast<T*>(row_ptr + key_component_count);
505 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Read-only probe of a fully-built baseline (composite-key) hash table.
//
// Hashes `key`, then linearly probes `composite_key_dict` (open addressing,
// wrap-around) until an entry equal to `key` is found. The key is expected
// to be present: a full unsuccessful sweep trips CHECK/assert.
//
// Returns a pointer to the matching composite key inside the dictionary
// (its index recoverable by dividing by key_component_count).
template <typename T>
DEVICE NEVER_INLINE const T* SUFFIX()
    get_matching_baseline_hash_slot_readonly(const T* key,
                                             const size_t key_component_count,
                                             const T* composite_key_dict,
                                             const int64_t entry_count,
                                             const size_t key_size_in_bytes) {
  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
  // BUG FIX: the element offset was previously stored in a uint32_t; for
  // multi-component keys with large entry counts, h * key_component_count
  // overflows 32 bits. Keep the product in size_t.
  size_t off = static_cast<size_t>(h) * key_component_count;
  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
    return &composite_key_dict[off];
  }
  // NOTE(review): h and h_probe stay 32-bit, which assumes entry_count fits
  // in uint32_t — confirm against the builder's entry-count limits.
  uint32_t h_probe = (h + 1) % entry_count;
  while (h_probe != h) {
    off = static_cast<size_t>(h_probe) * key_component_count;
    if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
      return &composite_key_dict[off];
    }
    h_probe = (h_probe + 1) % entry_count;
  }
#ifndef __CUDACC__
  CHECK(false);
#else
  assert(false);
#endif
  return nullptr;
}
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define CHECK(condition)
Definition: Logger.h:211

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Inclusive prefix sum of [first, last) into `out`, parallelized over
// `thread_count` workers via std::async.
//
// Strategy: (1) each worker scans its own chunk independently and records the
// chunk total; (2) the chunk totals are prefix-summed serially; (3) every
// chunk but the first gets its predecessor's running total added back in.
// Inputs below 10000 elements (or a single worker) are scanned serially.
template <typename InputIterator, typename OutputIterator>
void inclusive_scan(InputIterator first,
                    InputIterator last,
                    OutputIterator out,
                    const size_t thread_count) {
  using ElementType = typename InputIterator::value_type;
  using OffsetType = typename InputIterator::difference_type;
  const OffsetType total = last - first;
  if (total < 10000 || thread_count <= 1) {
    // Serial fallback: not worth the thread overhead.
    ElementType running = 0;
    for (auto it = first; it != last; ++it, ++out) {
      running += *it;
      *out = running;
    }
    return;
  }

  const OffsetType chunk = (total + thread_count - 1) / thread_count;
  std::vector<ElementType> chunk_sums(thread_count);
  std::vector<std::future<void>> workers;

  // Phase 1: independent inclusive scans per chunk; each records its total.
  for (size_t t = 0; t < thread_count; ++t) {
    const OffsetType lo = std::min(static_cast<OffsetType>(t) * chunk, total);
    const OffsetType hi = std::min(lo + chunk, total);
    workers.push_back(std::async(
        std::launch::async,
        [first, out](ElementType& chunk_sum, const OffsetType lo, const OffsetType hi) {
          ElementType acc = 0;
          auto src = first + lo;
          auto dst = out + lo;
          for (; src != (first + hi); ++src, ++dst) {
            acc += *src;
            *dst = acc;
          }
          chunk_sum = acc;
        },
        std::ref(chunk_sums[t]),
        lo,
        hi));
  }
  for (auto& w : workers) {
    w.get();
  }

  // Phase 2: serial prefix sum over the per-chunk totals.
  ElementType carry = 0;
  for (auto& s : chunk_sums) {
    s += carry;
    carry = s;
  }

  // Phase 3: shift every chunk after the first by its predecessor's total.
  workers.clear();
  for (size_t t = 0; t + 1 < thread_count; ++t) {
    const OffsetType lo = std::min(static_cast<OffsetType>(t + 1) * chunk, total);
    const OffsetType hi = std::min(lo + chunk, total);
    workers.push_back(std::async(
        std::launch::async,
        [out](const ElementType shift, const OffsetType lo, const OffsetType hi) {
          for (auto it = out + lo; it != (out + hi); ++it) {
            *it += shift;
          }
        },
        chunk_sums[t],
        lo,
        hi));
  }
  for (auto& w : workers) {
    w.get();
  }
}
future< Result > async(Fn &&fn, Args &&...args)
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Initializes a baseline (composite-key) hash table buffer to "empty".
//
// Each entry is key_component_count key slots of type T, optionally followed
// by one T-sized value slot. Key slots are set to the invalid-key sentinel;
// the value slot (when present) is set to invalid_slot_val. Threads cover the
// entry range with a strided loop: grid-stride on GPU, cpu_thread_idx /
// cpu_thread_count on CPU.
template <typename T>
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t* hash_buff,
                                                  const int64_t entry_count,
                                                  const size_t key_component_count,
                                                  const bool with_val_slot,
                                                  const int32_t invalid_slot_val,
                                                  const int32_t cpu_thread_idx,
                                                  const int32_t cpu_thread_count) {
#ifdef __CUDACC__
  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  int32_t step = blockDim.x * gridDim.x;
#else
  int32_t start = cpu_thread_idx;
  int32_t step = cpu_thread_count;
#endif
  // Entry size in bytes: all key components plus the optional value slot.
  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
  const T empty_key = SUFFIX(get_invalid_key)<T>();
  for (int64_t h = start; h < entry_count; h += step) {
    // 64-bit byte offset: entry index times entry size.
    int64_t off = h * hash_entry_size;
    auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
    for (size_t i = 0; i < key_component_count; ++i) {
      row_ptr[i] = empty_key;
    }
    if (with_val_slot) {
      row_ptr[key_component_count] = invalid_slot_val;
    }
  }
}
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Non-template entry point: initializes a baseline hash table buffer whose
// key components are 32-bit. Forwards all arguments unchanged to the
// init_baseline_hash_join_buff<T> template instantiated with int32_t.
void init_baseline_hash_join_buff_32(int8_t* hash_join_buff,
                                     const int64_t entry_count,
                                     const size_t key_component_count,
                                     const bool with_val_slot,
                                     const int32_t invalid_slot_val,
                                     const int32_t cpu_thread_idx,
                                     const int32_t cpu_thread_count) {
  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
                                        entry_count,
                                        key_component_count,
                                        with_val_slot,
                                        invalid_slot_val,
                                        cpu_thread_idx,
                                        cpu_thread_count);
}

+ Here is the caller graph for this function:

// Non-template entry point: initializes a baseline hash table buffer whose
// key components are 64-bit. Forwards all arguments unchanged to the
// init_baseline_hash_join_buff<T> template instantiated with int64_t.
void init_baseline_hash_join_buff_64(int8_t* hash_join_buff,
                                     const int64_t entry_count,
                                     const size_t key_component_count,
                                     const bool with_val_slot,
                                     const int32_t invalid_slot_val,
                                     const int32_t cpu_thread_idx,
                                     const int32_t cpu_thread_count) {
  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
                                        entry_count,
                                        key_component_count,
                                        with_val_slot,
                                        invalid_slot_val,
                                        cpu_thread_idx,
                                        cpu_thread_count);
}

+ Here is the caller graph for this function:

// Fills every slot of a perfect-hash join buffer with the invalid-slot
// sentinel. Work is striped across threads: on GPU each hardware thread
// starts at its flat grid index and strides by the grid size; on CPU each
// worker starts at cpu_thread_idx and strides by cpu_thread_count.
DEVICE void SUFFIX() init_hash_join_buff(int32_t* groups_buffer,
                                         const int64_t hash_entry_count,
                                         const int32_t invalid_slot_val,
                                         const int32_t cpu_thread_idx,
                                         const int32_t cpu_thread_count) {
#ifdef __CUDACC__
  const int32_t first_entry = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t stride = blockDim.x * gridDim.x;
#else
  const int32_t first_entry = cpu_thread_idx;
  const int32_t stride = cpu_thread_count;
#endif
  // 64-bit loop variable: entry counts may exceed 32 bits.
  for (int64_t entry_idx = first_entry; entry_idx < hash_entry_count;
       entry_idx += stride) {
    groups_buffer[entry_idx] = invalid_slot_val;
  }
}

+ Here is the caller graph for this function:

// TBB-parallel variant of init_hash_join_buff: fills every slot of the
// perfect-hash join buffer with the invalid-slot sentinel, letting TBB
// partition the entry range across its worker pool.
void SUFFIX() init_hash_join_buff_tbb(int32_t* groups_buffer,
                                      const int64_t hash_entry_count,
                                      const int32_t invalid_slot_val) {
  tbb::parallel_for(tbb::blocked_range<int64_t>(0, hash_entry_count),
                    [=](const tbb::blocked_range<int64_t>& range) {
                      for (int64_t idx = range.begin(); idx != range.end(); ++idx) {
                        groups_buffer[idx] = invalid_slot_val;
                      }
                    });
}
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Builds a baseline hash table with 32-bit key components for overlaps
// (geospatial bounding-box) joins. Forwards to the fill_baseline_hash_join_buff
// template instantiated with int32_t, returning its status code.
// NOTE(review): the hard-coded `false` occupies the parameter between
// invalid_slot_val and key_component_count — presumably a for_semi_join /
// bw-eq style flag; confirm against fill_baseline_hash_join_buff's signature.
int overlaps_fill_baseline_hash_join_buff_32(int8_t* hash_buff,
                                             const int64_t entry_count,
                                             const int32_t invalid_slot_val,
                                             const size_t key_component_count,
                                             const bool with_val_slot,
                                             const OverlapsKeyHandler* key_handler,
                                             const int64_t num_elems,
                                             const int32_t cpu_thread_idx,
                                             const int32_t cpu_thread_count) {
  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
                                               entry_count,
                                               invalid_slot_val,
                                               false,
                                               key_component_count,
                                               with_val_slot,
                                               key_handler,
                                               num_elems,
                                               cpu_thread_idx,
                                               cpu_thread_count);
}

+ Here is the caller graph for this function:

// Builds a baseline hash table with 64-bit key components for overlaps
// (geospatial bounding-box) joins. Forwards to the fill_baseline_hash_join_buff
// template instantiated with int64_t, returning its status code.
// NOTE(review): the hard-coded `false` mirrors the 32-bit wrapper —
// presumably a for_semi_join / bw-eq style flag; confirm against
// fill_baseline_hash_join_buff's signature.
int overlaps_fill_baseline_hash_join_buff_64(int8_t* hash_buff,
                                             const int64_t entry_count,
                                             const int32_t invalid_slot_val,
                                             const size_t key_component_count,
                                             const bool with_val_slot,
                                             const OverlapsKeyHandler* key_handler,
                                             const int64_t num_elems,
                                             const int32_t cpu_thread_idx,
                                             const int32_t cpu_thread_count) {
  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
                                               entry_count,
                                               invalid_slot_val,
                                               false,
                                               key_component_count,
                                               with_val_slot,
                                               key_handler,
                                               num_elems,
                                               cpu_thread_idx,
                                               cpu_thread_count);
}
// Builds a baseline hash table with 32-bit key components for range joins.
// Forwards to the fill_baseline_hash_join_buff template instantiated with
// int32_t, returning its status code. Note entry_count/num_elems are size_t
// here, unlike the int64_t used by the overlaps wrappers.
// NOTE(review): the hard-coded `false` — presumably a for_semi_join / bw-eq
// style flag; confirm against fill_baseline_hash_join_buff's signature.
int range_fill_baseline_hash_join_buff_32(int8_t* hash_buff,
                                          const size_t entry_count,
                                          const int32_t invalid_slot_val,
                                          const size_t key_component_count,
                                          const bool with_val_slot,
                                          const RangeKeyHandler* key_handler,
                                          const size_t num_elems,
                                          const int32_t cpu_thread_idx,
                                          const int32_t cpu_thread_count) {
  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
                                               entry_count,
                                               invalid_slot_val,
                                               false,
                                               key_component_count,
                                               with_val_slot,
                                               key_handler,
                                               num_elems,
                                               cpu_thread_idx,
                                               cpu_thread_count);
}

+ Here is the caller graph for this function:

// Builds a baseline hash table with 64-bit key components for range joins.
// Forwards to the fill_baseline_hash_join_buff template instantiated with
// int64_t, returning its status code. Note entry_count/num_elems are size_t
// here, unlike the int64_t used by the overlaps wrappers.
// NOTE(review): the hard-coded `false` — presumably a for_semi_join / bw-eq
// style flag; confirm against fill_baseline_hash_join_buff's signature.
int range_fill_baseline_hash_join_buff_64(int8_t* hash_buff,
                                          const size_t entry_count,
                                          const int32_t invalid_slot_val,
                                          const size_t key_component_count,
                                          const bool with_val_slot,
                                          const RangeKeyHandler* key_handler,
                                          const size_t num_elems,
                                          const int32_t cpu_thread_idx,
                                          const int32_t cpu_thread_count) {
  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
                                               entry_count,
                                               invalid_slot_val,
                                               false,
                                               key_component_count,
                                               with_val_slot,
                                               key_handler,
                                               num_elems,
                                               cpu_thread_idx,
                                               cpu_thread_count);
}
// Inserts `key` -> `val` into a baseline hash table (one-to-one semantics).
//
// Hashes the key, then linearly probes (with wrap-around) for a slot that is
// free or already holds this key, using the lock-free claim protocol in
// get_matching_baseline_hash_slot_at.
//
// Returns:
//    0  on success (or when only the key was to be written, !with_val_slot)
//   -1  when the entry's value slot was already claimed by another row —
//       i.e. a duplicate key, which one-to-one tables cannot represent
//   -2  when a full probe sweep found no usable slot (table is full)
template <typename T>
DEVICE int write_baseline_hash_slot(const int32_t val,
                                    int8_t* hash_buff,
                                    const int64_t entry_count,
                                    const T* key,
                                    const size_t key_component_count,
                                    const bool with_val_slot,
                                    const int32_t invalid_slot_val,
                                    const size_t key_size_in_bytes,
                                    const size_t hash_entry_size) {
  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
  T* matching_group = get_matching_baseline_hash_slot_at(
      hash_buff, h, key, key_component_count, hash_entry_size);
  if (!matching_group) {
    // Home bucket taken by a different key: probe forward until we find a
    // usable slot or come all the way back around.
    uint32_t h_probe = (h + 1) % entry_count;
    while (h_probe != h) {
      matching_group = get_matching_baseline_hash_slot_at(
          hash_buff, h_probe, key, key_component_count, hash_entry_size);
      if (matching_group) {
        break;
      }
      h_probe = (h_probe + 1) % entry_count;
    }
  }
  if (!matching_group) {
    return -2;
  }
  if (!with_val_slot) {
    return 0;
  }
  // Claim the value slot exactly once; a CAS that observes a non-invalid
  // value means another row already owns this key.
  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
    return -1;
  }
  return 0;
}
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function:

// Semi-join variant of write_baseline_hash_slot: inserts `key` -> `val` but
// tolerates duplicate keys. Probing is identical to the one-to-one writer;
// the only difference is that the value-slot CAS result is ignored — for a
// semi-join any single matching row id per key suffices, so losing the race
// to another row with the same key is not an error.
//
// Returns:
//    0  on success (first writer wins the value slot; later writers no-op)
//   -2  when a full probe sweep found no usable slot (table is full)
template <typename T>
DEVICE int write_baseline_hash_slot_for_semi_join(const int32_t val,
                                                  int8_t* hash_buff,
                                                  const int64_t entry_count,
                                                  const T* key,
                                                  const size_t key_component_count,
                                                  const bool with_val_slot,
                                                  const int32_t invalid_slot_val,
                                                  const size_t key_size_in_bytes,
                                                  const size_t hash_entry_size) {
  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
  T* matching_group = get_matching_baseline_hash_slot_at(
      hash_buff, h, key, key_component_count, hash_entry_size);
  if (!matching_group) {
    // Home bucket taken by a different key: linear probe with wrap-around.
    uint32_t h_probe = (h + 1) % entry_count;
    while (h_probe != h) {
      matching_group = get_matching_baseline_hash_slot_at(
          hash_buff, h_probe, key, key_component_count, hash_entry_size);
      if (matching_group) {
        break;
      }
      h_probe = (h_probe + 1) % entry_count;
    }
  }
  if (!matching_group) {
    return -2;
  }
  if (!with_val_slot) {
    return 0;
  }
  // Best-effort claim: only the first writer's val sticks; the result is
  // deliberately ignored because duplicates are acceptable for semi-joins.
  mapd_cas(matching_group, invalid_slot_val, val);
  return 0;
}
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function: