OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntime.cpp File Reference
+ Include dependency graph for HashJoinRuntime.cpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces

 anonymous_namespace{HashJoinRuntime.cpp}
 

Macros

#define mapd_cas(address, compare, val)   __sync_val_compare_and_swap(address, compare, val)
 
#define cas_cst(ptr, expected, desired)
 
#define store_cst(ptr, val)   __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
 
#define load_cst(ptr)   __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
 
#define mapd_add(address, val)   __sync_fetch_and_add(address, val)
 

Functions

int64_t anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict (const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
 
DEVICE void SUFFIX() init_hash_join_buff (int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl (int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl (int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
DEVICE int SUFFIX() fill_hash_join_buff_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
T * get_matching_baseline_hash_slot_at (int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join (const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
 
template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() count_matches (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() count_matches_bucketized (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
GLOBAL void SUFFIX() count_matches_sharded (int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
DEVICE NEVER_INLINE const T
*SUFFIX() 
get_matching_baseline_hash_slot_readonly (const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline (int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_bucketized (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
 
GLOBAL void SUFFIX() fill_row_ids_sharded (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized (int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl (uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl (double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename InputIterator , typename OutputIterator >
void inclusive_scan (InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
 
void fill_one_to_many_hash_table_sharded (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
template<typename T >
void fill_one_to_many_baseline_hash_table (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const size_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 

Macro Definition Documentation

#define cas_cst (   ptr,
  expected,
  desired 
)
Value:
__atomic_compare_exchange_n( \
ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

Definition at line 460 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define load_cst (   ptr)    __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

Definition at line 464 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

#define mapd_add (   address,
  val 
)    __sync_fetch_and_add(address, val)
#define mapd_cas (   address,
  compare,
  val 
)    __sync_val_compare_and_swap(address, compare, val)
#define store_cst (   ptr,
  val 
)    __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

Definition at line 463 of file HashJoinRuntime.cpp.

Referenced by get_matching_baseline_hash_slot_at().

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2149 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2154  {
2155  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2156  CHECK(!join_column_per_key.empty());
2157 
2158  std::vector<std::future<void>> approx_distinct_threads;
2159  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
 2160  approx_distinct_threads.push_back(std::async(
 2161  std::launch::async,
 2162  [&join_column_per_key,
2163  &type_info_per_key,
2164  b,
2165  hll_buffer_all_cpus,
2166  padded_size_bytes,
2167  thread_idx,
2168  thread_count] {
2169  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2170 
2171  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2172  false,
2173  &join_column_per_key[0],
2174  &type_info_per_key[0],
2175  nullptr,
 2176  nullptr);
 2177  approximate_distinct_tuples_impl(hll_buffer,
 2178  nullptr,
2179  b,
2180  join_column_per_key[0].num_elems,
2181  &key_handler,
2182  thread_idx,
2183  thread_count);
2184  }));
2185  }
2186  for (auto& child : approx_distinct_threads) {
2187  child.get();
2188  }
2189 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename KEY_HANDLER >
GLOBAL void SUFFIX() approximate_distinct_tuples_impl ( uint8_t *  hll_buffer,
int32_t *  row_count_buffer,
const uint32_t  b,
const int64_t  num_elems,
const KEY_HANDLER *  f,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1212 of file HashJoinRuntime.cpp.

References atomicMax(), g_maximum_conditions_to_coalesce, get_rank(), and MurmurHash64AImpl().

Referenced by approximate_distinct_tuples(), approximate_distinct_tuples_overlaps(), and approximate_distinct_tuples_range().

1222  {
1223 #ifdef __CUDACC__
1224  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1225  int32_t step = blockDim.x * gridDim.x;
1226 #else
1227  int32_t start = cpu_thread_idx;
1228  int32_t step = cpu_thread_count;
1229 #endif
1230 
1231  auto key_buff_handler = [b, hll_buffer, row_count_buffer](
1232  const int64_t entry_idx,
1233  const int64_t* key_scratch_buff,
1234  const size_t key_component_count) {
1235  if (row_count_buffer) {
1236  row_count_buffer[entry_idx] += 1;
1237  }
1238 
1239  const uint64_t hash =
1240  MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);
1241  const uint32_t index = hash >> (64 - b);
1242  const auto rank = get_rank(hash << b, 64 - b);
1243 #ifdef __CUDACC__
1244  atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);
1245 #else
1246  hll_buffer[index] = std::max(hll_buffer[index], rank);
1247 #endif
1248 
1249  return 0;
1250  };
1251 
1252  int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];
1253 
1254  JoinColumnTuple cols(
1255  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1256  for (auto& it : cols.slice(start, step)) {
1257  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1258  }
1259 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
constexpr double f
Definition: Utm.h:31
__device__ double atomicMax(double *address, double val)
const size_t g_maximum_conditions_to_coalesce
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2191 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2199  {
2200  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2201  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2202  CHECK(!join_column_per_key.empty());
2203 
2204  std::vector<std::future<void>> approx_distinct_threads;
2205  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
 2206  approx_distinct_threads.push_back(std::async(
 2207  std::launch::async,
 2208  [&join_column_per_key,
2209  &join_buckets_per_key,
2210  &row_counts,
2211  b,
2212  hll_buffer_all_cpus,
2213  padded_size_bytes,
2214  thread_idx,
2215  thread_count] {
2216  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2217 
2218  const auto key_handler = OverlapsKeyHandler(
2219  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2220  &join_column_per_key[0],
 2221  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
 2222  approximate_distinct_tuples_impl(hll_buffer,
 2223  row_counts.data(),
2224  b,
2225  join_column_per_key[0].num_elems,
2226  &key_handler,
2227  thread_idx,
2228  thread_count);
2229  }));
2230  }
2231  for (auto& child : approx_distinct_threads) {
2232  child.get();
2233  }
 2234 
 2235  inclusive_scan(
 2236  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2237 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2239 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2248  {
2249  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2250  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2251  CHECK(!join_column_per_key.empty());
2252 
2253  std::vector<std::future<void>> approx_distinct_threads;
2254  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
 2255  approx_distinct_threads.push_back(std::async(
 2256  std::launch::async,
 2257  [&join_column_per_key,
2258  &join_buckets_per_key,
2259  &row_counts,
2260  b,
2261  hll_buffer_all_cpus,
2262  padded_size_bytes,
2263  thread_idx,
2264  is_compressed,
2265  thread_count] {
2266  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2267 
2268  const auto key_handler = RangeKeyHandler(
2269  is_compressed,
2270  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2271  &join_column_per_key[0],
 2272  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
 2273  approximate_distinct_tuples_impl(hll_buffer,
 2274  row_counts.data(),
2275  b,
2276  join_column_per_key[0].num_elems,
2277  &key_handler,
2278  thread_idx,
2279  thread_count);
2280  }));
2281  }
2282  for (auto& child : approx_distinct_threads) {
2283  child.get();
2284  }
 2285 
 2286  inclusive_scan(
 2287  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2288 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<size_t N>
GLOBAL void SUFFIX() compute_bucket_sizes_impl ( double *  bucket_sizes_for_thread,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1282 of file HashJoinRuntime.cpp.

References atomicMin(), fixed_width_double_decode_noinline(), anonymous_namespace{Utm.h}::N, JoinColumnIterator::ptr(), and SUFFIX.

1291  {
1292 #ifdef __CUDACC__
1293  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1294  int32_t step = blockDim.x * gridDim.x;
1295 #else
1296  int32_t start = cpu_thread_idx;
1297  int32_t step = cpu_thread_count;
1298 #endif
1299  JoinColumnIterator it(join_column, type_info, start, step);
1300  for (; it; ++it) {
1301  // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)
1302  double bounds[2 * N];
1303  for (size_t j = 0; j < 2 * N; j++) {
1304  bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);
1305  }
1306 
1307  for (size_t j = 0; j < N; j++) {
1308  const auto diff = bounds[j + N] - bounds[j];
1309 #ifdef __CUDACC__
1310  if (diff > bucket_size_thresholds[j]) {
1311  atomicMin(&bucket_sizes_for_thread[j], diff);
1312  }
1313 #else
1314  if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {
1315  bucket_sizes_for_thread[j] = diff;
1316  }
1317 #endif
1318  }
1319  }
1320 }
__device__ double atomicMin(double *address, double val)
#define SUFFIX(name)
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
constexpr unsigned N
Definition: Utm.h:110
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:133

+ Here is the call graph for this function:

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2290 of file HashJoinRuntime.cpp.

References threading_serial::async().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2294  {
2295  std::vector<std::vector<double>> bucket_sizes_for_threads;
2296  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2297  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2298  }
2299  std::vector<std::future<void>> threads;
2300  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2301  threads.push_back(std::async(std::launch::async,
2302  compute_bucket_sizes_impl<2>,
2303  bucket_sizes_for_threads[thread_idx].data(),
2304  &join_column,
2305  &type_info,
2306  bucket_size_thresholds.data(),
2307  thread_idx,
2308  thread_count));
2309  }
2310  for (auto& child : threads) {
2311  child.get();
2312  }
2313 
2314  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2315  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2316  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2317  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2318  }
2319  }
2320  }
2321 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 704 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

714  {
715  auto slot_sel = [&type_info](auto count_buff, auto elem) {
716  return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);
717  };
718  count_matches_impl(count_buff,
719  join_column,
720  type_info
721 #ifndef __CUDACC__
722  ,
723  sd_inner_to_outer_translation_map,
724  min_inner_elem,
725  cpu_thread_idx,
726  cpu_thread_count
727 #endif
728  ,
729  slot_sel);
730 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:74
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() count_matches_baseline ( int32_t *  count_buff,
const T *  composite_key_dict,
const int64_t  entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 846 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and heavydb.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

856  {
857 #ifdef __CUDACC__
858  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
859  int32_t step = blockDim.x * gridDim.x;
860 #else
861  int32_t start = cpu_thread_idx;
862  int32_t step = cpu_thread_count;
863 #endif
864 #ifdef __CUDACC__
865  assert(composite_key_dict);
866 #endif
867  T key_scratch_buff[g_maximum_conditions_to_coalesce];
868  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
869  auto key_buff_handler = [composite_key_dict,
870  entry_count,
871  count_buff,
872  key_size_in_bytes](const int64_t row_entry_idx,
873  const T* key_scratch_buff,
874  const size_t key_component_count) {
875  const auto matching_group =
876          get_matching_baseline_hash_slot_readonly(key_scratch_buff,
877                                                   key_component_count,
878  composite_key_dict,
879  entry_count,
880  key_size_in_bytes);
881  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
882  mapd_add(&count_buff[entry_idx], int32_t(1));
883  return 0;
884  };
885 
886  JoinColumnTuple cols(
887  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
888  for (auto& it : cols.slice(start, step)) {
889  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
890  }
891 }
#define SUFFIX(name)
constexpr double f
Definition: Utm.h:31
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_bucketized ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 732 of file HashJoinRuntime.cpp.

References count_matches_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

744  {
745  auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {
746      return SUFFIX(get_bucketized_hash_slot)(
747          count_buff, elem, type_info.min_val, bucket_normalization);
748  };
749  count_matches_impl(count_buff,
750  join_column,
751  type_info
752 #ifndef __CUDACC__
753  ,
754  sd_inner_to_outer_translation_map,
755  min_inner_elem,
756  cpu_thread_idx,
757  cpu_thread_count
758 #endif
759  ,
760  slot_sel);
761 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
#define SUFFIX(name)
const int64_t min_val
DEVICE void count_matches_impl(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void count_matches_impl ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 656 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), and mapd_add.

Referenced by count_matches(), and count_matches_bucketized().

667  {
668 #ifdef __CUDACC__
669  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
670  int32_t step = blockDim.x * gridDim.x;
671 #else
672  int32_t start = cpu_thread_idx;
673  int32_t step = cpu_thread_count;
674 #endif
675  JoinColumnTyped col{&join_column, &type_info};
676  for (auto item : col.slice(start, step)) {
677  int64_t elem = item.element;
678  if (elem == type_info.null_val) {
679  if (type_info.uses_bw_eq) {
680  elem = type_info.translated_null_val;
681  } else {
682  continue;
683  }
684  }
685 #ifndef __CUDACC__
686  if (sd_inner_to_outer_translation_map &&
687  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
688  const auto outer_id = map_str_id_to_outer_dict(elem,
689  min_inner_elem,
690  type_info.min_val,
691  type_info.max_val,
692  sd_inner_to_outer_translation_map);
693  if (outer_id == StringDictionary::INVALID_STR_ID) {
694  continue;
695  }
696  elem = outer_id;
697  }
698 #endif
699  auto* entry_ptr = slot_selector(count_buff, elem);
700  mapd_add(entry_ptr, int32_t(1));
701  }
702 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() count_matches_sharded ( int32_t *  count_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 763 of file HashJoinRuntime.cpp.

References get_hash_slot_sharded(), StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

775  {
776 #ifdef __CUDACC__
777  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
778  int32_t step = blockDim.x * gridDim.x;
779 #else
780  int32_t start = cpu_thread_idx;
781  int32_t step = cpu_thread_count;
782 #endif
783  JoinColumnTyped col{&join_column, &type_info};
784  for (auto item : col.slice(start, step)) {
785  int64_t elem = item.element;
786  if (elem == type_info.null_val) {
787  if (type_info.uses_bw_eq) {
788  elem = type_info.translated_null_val;
789  } else {
790  continue;
791  }
792  }
793 #ifndef __CUDACC__
794  if (sd_inner_to_outer_translation_map &&
795  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
796  const auto outer_id = map_str_id_to_outer_dict(elem,
797  min_inner_elem,
798  type_info.min_val,
799  type_info.max_val,
800  sd_inner_to_outer_translation_map);
801  if (outer_id == StringDictionary::INVALID_STR_ID) {
802  continue;
803  }
804  elem = outer_id;
805  }
806 #endif
807  int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,
808  elem,
809  type_info.min_val,
810  shard_info.entry_count_per_shard,
811  shard_info.num_shards,
812  shard_info.device_count);
813  mapd_add(entry_ptr, int32_t(1));
814  }
815 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
const int64_t null_val
const size_t entry_count_per_shard
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:95
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename FILL_HANDLER >
DEVICE int SUFFIX() fill_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const FILL_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 578 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, and heavydb.dtypes::T.

587  {
588 #ifdef __CUDACC__
589  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
590  int32_t step = blockDim.x * gridDim.x;
591 #else
592  int32_t start = cpu_thread_idx;
593  int32_t step = cpu_thread_count;
594 #endif
595 
596  T key_scratch_buff[g_maximum_conditions_to_coalesce];
597  const size_t key_size_in_bytes = key_component_count * sizeof(T);
598  const size_t hash_entry_size =
599  (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
600  auto key_buff_handler = [hash_buff,
601  entry_count,
602  with_val_slot,
603  invalid_slot_val,
604  key_size_in_bytes,
605  hash_entry_size,
606  &for_semi_join](const int64_t entry_idx,
607  const T* key_scratch_buffer,
608  const size_t key_component_count) {
609  if (for_semi_join) {
610  return write_baseline_hash_slot_for_semi_join<T>(entry_idx,
611  hash_buff,
612  entry_count,
613  key_scratch_buffer,
614  key_component_count,
615  with_val_slot,
616  invalid_slot_val,
617  key_size_in_bytes,
618  hash_entry_size);
619  } else {
620  return write_baseline_hash_slot<T>(entry_idx,
621  hash_buff,
622  entry_count,
623  key_scratch_buffer,
624  key_component_count,
625  with_val_slot,
626  invalid_slot_val,
627  key_size_in_bytes,
628  hash_entry_size);
629  }
630  };
631 
632  JoinColumnTuple cols(
633  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
634  for (auto& it : cols.slice(start, step)) {
635  const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
636  if (err) {
637  return err;
638  }
639  }
640  return 0;
641 }
constexpr double f
Definition: Utm.h:31
const size_t g_maximum_conditions_to_coalesce
int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1758 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1767  {
1768  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1769  entry_count,
1770  invalid_slot_val,
1771  for_semi_join,
1772  key_component_count,
1773  with_val_slot,
1774  key_handler,
1775  num_elems,
1776  cpu_thread_idx,
1777  cpu_thread_count);
1778 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1822 of file HashJoinRuntime.cpp.

1831  {
1832  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1833  entry_count,
1834  invalid_slot_val,
1835  for_semi_join,
1836  key_component_count,
1837  with_val_slot,
1838  key_handler,
1839  num_elems,
1840  cpu_thread_idx,
1841  cpu_thread_count);
1842 }
DEVICE int SUFFIX() fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 195 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

203  {
204  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
205  : SUFFIX(fill_one_to_one_hashtable);
206  auto hashtable_filling_func = [&](auto elem, size_t index) {
207  auto entry_ptr = SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
208  return filling_func(index, entry_ptr, invalid_slot_val);
209  };
210 
211  return fill_hash_join_buff_impl(buff,
212  join_column,
213  type_info,
214  sd_inner_to_outer_translation_map,
215  min_inner_elem,
216  cpu_thread_idx,
217  cpu_thread_count,
218  hashtable_filling_func);
219 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:74
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 166 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

176  {
177  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
178  : SUFFIX(fill_one_to_one_hashtable);
179  auto hashtable_filling_func = [&](auto elem, size_t index) {
180  auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(
181  buff, elem, type_info.min_val, bucket_normalization);
182  return filling_func(index, entry_ptr, invalid_slot_val);
183  };
184 
185  return fill_hash_join_buff_impl(buff,
186  join_column,
187  type_info,
188  sd_inner_to_outer_translation_map,
189  min_inner_elem,
190  cpu_thread_idx,
191  cpu_thread_count,
192  hashtable_filling_func);
193 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE auto fill_hash_join_buff_impl ( int32_t *  buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 119 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff(), and fill_hash_join_buff_bucketized().

126  {
127 #ifdef __CUDACC__
128  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
129  int32_t step = blockDim.x * gridDim.x;
130 #else
131  int32_t start = cpu_thread_idx;
132  int32_t step = cpu_thread_count;
133 #endif
134  JoinColumnTyped col{&join_column, &type_info};
135  for (auto item : col.slice(start, step)) {
136  const size_t index = item.index;
137  int64_t elem = item.element;
138  if (elem == type_info.null_val) {
139  if (type_info.uses_bw_eq) {
140  elem = type_info.translated_null_val;
141  } else {
142  continue;
143  }
144  }
145 #ifndef __CUDACC__
146  if (sd_inner_to_outer_translation_map &&
147  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
148  const auto outer_id = map_str_id_to_outer_dict(elem,
149  min_inner_elem,
150  type_info.min_val,
151  type_info.max_val,
152  sd_inner_to_outer_translation_map);
153  if (outer_id == StringDictionary::INVALID_STR_ID) {
154  continue;
155  }
156  elem = outer_id;
157  }
158 #endif
159  if (filling_func(elem, index)) {
160  return -1;
161  }
162  }
163  return 0;
164 };
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 313 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded().

323  {
324  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
325  : SUFFIX(fill_one_to_one_hashtable);
326  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
327  auto entry_ptr = SUFFIX(get_hash_slot_sharded_opt)(buff,
328  elem,
329  type_info.min_val,
330  shard_info.entry_count_per_shard,
331  shard,
332  shard_info.num_shards,
333  shard_info.device_count);
334  return filling_func(index, entry_ptr, invalid_slot_val);
335  };
336 
337    return fill_hash_join_buff_sharded_impl(buff,
338                                            join_column,
339  type_info,
340  shard_info,
341  sd_inner_to_outer_translation_map,
342  min_inner_elem,
343  cpu_thread_idx,
344  cpu_thread_count,
345  hashtable_filling_func);
346 }
const size_t num_shards
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:124
const int64_t min_val
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 275 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_sharded_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot_sharded_opt(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper_sharded_bucketized().

286  {
287  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
288  : SUFFIX(fill_one_to_one_hashtable);
289  auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {
290  auto entry_ptr =
291        SUFFIX(get_bucketized_hash_slot_sharded_opt)(buff,
292                                                     elem,
293  type_info.min_val,
294  shard_info.entry_count_per_shard,
295  shard,
296  shard_info.num_shards,
297  shard_info.device_count,
298  bucket_normalization);
299  return filling_func(index, entry_ptr, invalid_slot_val);
300  };
301 
302    return fill_hash_join_buff_sharded_impl(buff,
303                                            join_column,
304  type_info,
305  shard_info,
306  sd_inner_to_outer_translation_map,
307  min_inner_elem,
308  cpu_thread_idx,
309  cpu_thread_count,
310  hashtable_filling_func);
311 }
const size_t num_shards
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:109
#define SUFFIX(name)
const int device_count
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
const size_t entry_count_per_shard
const int64_t min_val
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename HASHTABLE_FILLING_FUNC >
DEVICE int fill_hash_join_buff_sharded_impl ( int32_t *  buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
HASHTABLE_FILLING_FUNC  filling_func 
)

Definition at line 222 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, ShardInfo::num_shards, ShardInfo::shard, SHARD_FOR_KEY, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_sharded(), and fill_hash_join_buff_sharded_bucketized().

231  {
232 #ifdef __CUDACC__
233  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
234  int32_t step = blockDim.x * gridDim.x;
235 #else
236  int32_t start = cpu_thread_idx;
237  int32_t step = cpu_thread_count;
238 #endif
239  JoinColumnTyped col{&join_column, &type_info};
240  for (auto item : col.slice(start, step)) {
241  const size_t index = item.index;
242  int64_t elem = item.element;
243  size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);
244  if (shard != shard_info.shard) {
245  continue;
246  }
247  if (elem == type_info.null_val) {
248  if (type_info.uses_bw_eq) {
249  elem = type_info.translated_null_val;
250  } else {
251  continue;
252  }
253  }
254 #ifndef __CUDACC__
255  if (sd_inner_to_outer_translation_map &&
256  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
257  const auto outer_id = map_str_id_to_outer_dict(elem,
258  min_inner_elem,
259  type_info.min_val,
260  type_info.max_val,
261  sd_inner_to_outer_translation_map);
262  if (outer_id == StringDictionary::INVALID_STR_ID) {
263  continue;
264  }
265  elem = outer_id;
266  }
267 #endif
268  if (filling_func(elem, shard, index)) {
269  return -1;
270  }
271  }
272  return 0;
273 }
const size_t num_shards
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const size_t shard
const int64_t max_val
const int64_t min_val
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
void fill_one_to_many_baseline_hash_table ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const size_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 1887 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, count_matches_baseline(), fill_row_ids_baseline(), inclusive_scan(), and SUFFIX.

1899  {
1900  int32_t* pos_buff = buff;
1901  int32_t* count_buff = buff + hash_entry_count;
1902  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1903  std::vector<std::future<void>> counter_threads;
1904  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1905  if (is_range_join) {
1906  counter_threads.push_back(std::async(
1907          std::launch::async,
1908          [count_buff,
1909  composite_key_dict,
1910  &hash_entry_count,
1911  &join_buckets_per_key,
1912  &join_column_per_key,
1913  &is_geo_compressed,
1914  cpu_thread_idx,
1915  cpu_thread_count] {
1916  const auto key_handler = RangeKeyHandler(
1917  is_geo_compressed,
1918  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1919  &join_column_per_key[0],
1920  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1921  count_matches_baseline(count_buff,
1922  composite_key_dict,
1923  hash_entry_count,
1924  &key_handler,
1925  join_column_per_key[0].num_elems,
1926  cpu_thread_idx,
1927  cpu_thread_count);
1928  }));
1929  } else if (join_buckets_per_key.size() > 0) {
1930  counter_threads.push_back(std::async(
1931          std::launch::async,
1932          [count_buff,
1933  composite_key_dict,
1934  &hash_entry_count,
1935  &join_buckets_per_key,
1936  &join_column_per_key,
1937  cpu_thread_idx,
1938  cpu_thread_count] {
1939  const auto key_handler = OverlapsKeyHandler(
1940  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
1941  &join_column_per_key[0],
1942  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
1943  count_matches_baseline(count_buff,
1944  composite_key_dict,
1945  hash_entry_count,
1946  &key_handler,
1947  join_column_per_key[0].num_elems,
1948  cpu_thread_idx,
1949  cpu_thread_count);
1950  }));
1951  } else {
1952  counter_threads.push_back(
1953          std::async(std::launch::async,
1954                     [count_buff,
1955  composite_key_dict,
1956  &key_component_count,
1957  &hash_entry_count,
1958  &join_column_per_key,
1959  &type_info_per_key,
1960  &sd_inner_to_outer_translation_maps,
1961  &sd_min_inner_elems,
1962  cpu_thread_idx,
1963  cpu_thread_count] {
1964  const auto key_handler =
1965  GenericKeyHandler(key_component_count,
1966  true,
1967  &join_column_per_key[0],
1968  &type_info_per_key[0],
1969  &sd_inner_to_outer_translation_maps[0],
1970  &sd_min_inner_elems[0]);
1971  count_matches_baseline(count_buff,
1972  composite_key_dict,
1973  hash_entry_count,
1974  &key_handler,
1975  join_column_per_key[0].num_elems,
1976  cpu_thread_idx,
1977  cpu_thread_count);
1978  }));
1979  }
1980  }
1981 
1982  for (auto& child : counter_threads) {
1983  child.get();
1984  }
1985 
1986  std::vector<int32_t> count_copy(hash_entry_count, 0);
1987  CHECK_GT(hash_entry_count, int64_t(0));
1988  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1989   inclusive_scan(
1990       count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1991  std::vector<std::future<void>> pos_threads;
1992  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1993  pos_threads.push_back(std::async(
1994          std::launch::async,
1995          [&](const int thread_idx) {
1996  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1997  if (count_buff[i]) {
1998  pos_buff[i] = count_copy[i];
1999  }
2000  }
2001  },
2002  cpu_thread_idx));
2003  }
2004  for (auto& child : pos_threads) {
2005  child.get();
2006  }
2007 
2008  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
2009  std::vector<std::future<void>> rowid_threads;
2010  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
2011  if (is_range_join) {
2012  rowid_threads.push_back(std::async(
2013          std::launch::async,
2014          [buff,
2015  composite_key_dict,
2016  hash_entry_count,
2017  &join_column_per_key,
2018  &join_buckets_per_key,
2019  &is_geo_compressed,
2020  cpu_thread_idx,
2021  cpu_thread_count] {
2022  const auto key_handler = RangeKeyHandler(
2023  is_geo_compressed,
2024  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2025  &join_column_per_key[0],
2026  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2027          fill_row_ids_baseline<T, RangeKeyHandler>
2028          (buff,
2029  composite_key_dict,
2030  hash_entry_count,
2031  &key_handler,
2032  join_column_per_key[0].num_elems,
2033  cpu_thread_idx,
2034  cpu_thread_count);
2035  }));
2036  } else if (join_buckets_per_key.size() > 0) {
2037  rowid_threads.push_back(std::async(
2038          std::launch::async,
2039          [buff,
2040  composite_key_dict,
2041  hash_entry_count,
2042  &join_column_per_key,
2043  &join_buckets_per_key,
2044  cpu_thread_idx,
2045  cpu_thread_count] {
2046  const auto key_handler = OverlapsKeyHandler(
2047  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2048  &join_column_per_key[0],
2049  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2050          fill_row_ids_baseline<T, OverlapsKeyHandler>
2051          (buff,
2052  composite_key_dict,
2053  hash_entry_count,
2054  &key_handler,
2055  join_column_per_key[0].num_elems,
2056  cpu_thread_idx,
2057  cpu_thread_count);
2058  }));
2059  } else {
2060  rowid_threads.push_back(std::async(std::launch::async,
2061  [buff,
2062  composite_key_dict,
2063  hash_entry_count,
2064  key_component_count,
2065  &join_column_per_key,
2066  &type_info_per_key,
2067  &sd_inner_to_outer_translation_maps,
2068  &sd_min_inner_elems,
2069  cpu_thread_idx,
2070  cpu_thread_count] {
2071  const auto key_handler = GenericKeyHandler(
2072  key_component_count,
2073  true,
2074  &join_column_per_key[0],
2075  &type_info_per_key[0],
2076  &sd_inner_to_outer_translation_maps[0],
2077  &sd_min_inner_elems[0]);
2078          fill_row_ids_baseline<T, GenericKeyHandler>
2079          (buff,
2080  composite_key_dict,
2081  hash_entry_count,
2082  &key_handler,
2083  join_column_per_key[0].num_elems,
2084  cpu_thread_idx,
2085  cpu_thread_count);
2086  }));
2087  }
2088  }
2089 
2090  for (auto& child : rowid_threads) {
2091  child.get();
2092  }
2093 }
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:234
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 2095 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2107  {
2108  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2109  composite_key_dict,
2110  hash_entry_count,
2111  key_component_count,
2112  join_column_per_key,
2113  type_info_per_key,
2114  join_bucket_info,
2115  sd_inner_to_outer_translation_maps,
2116  sd_min_inner_elems,
2117  cpu_thread_count,
2118  is_range_join,
2119  is_geo_compressed);
2120 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join,
const bool  is_geo_compressed 
)

Definition at line 2122 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2134  {
2135  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2136  composite_key_dict,
2137  hash_entry_count,
2138  key_component_count,
2139  join_column_per_key,
2140  type_info_per_key,
2141  join_bucket_info,
2142  sd_inner_to_outer_translation_maps,
2143  sd_min_inner_elems,
2144  cpu_thread_count,
2145  is_range_join,
2146  is_geo_compressed);
2147 }

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1458 of file HashJoinRuntime.cpp.

References count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1464  {
1465  auto timer = DEBUG_TIMER(__func__);
1466  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1467  &join_column,
1468  &type_info,
1469  sd_inner_to_outer_translation_map,
1470  min_inner_elem](auto cpu_thread_idx,
1471  auto cpu_thread_count) {
1472  SUFFIX(count_matches)
1473  (count_buff,
1474  join_column,
1475  type_info,
1476  sd_inner_to_outer_translation_map,
1477  min_inner_elem,
1478  cpu_thread_idx,
1479  cpu_thread_count);
1480  };
1481  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1482  buff,
1483  &join_column,
1484  &type_info,
1485  sd_inner_to_outer_translation_map,
1486  min_inner_elem](auto cpu_thread_idx,
1487  auto cpu_thread_count) {
1488  SUFFIX(fill_row_ids)
1489  (buff,
1490  hash_entry_count,
1491  join_column,
1492  type_info,
1493  sd_inner_to_outer_translation_map,
1494  min_inner_elem,
1495  cpu_thread_idx,
1496  cpu_thread_count);
1497  };
1498 
1499  fill_one_to_many_hash_table_impl(buff,
1500  hash_entry_info.hash_entry_count,
1501  join_column,
1502  type_info,
1503  sd_inner_to_outer_translation_map,
1504  min_inner_elem,
1505  cpu_thread_count,
1506  launch_count_matches,
1507  launch_fill_row_ids);
1508 }
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
size_t hash_entry_count
#define DEBUG_TIMER(name)
Definition: Logger.h:371

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1510 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1517  {
1518  auto timer = DEBUG_TIMER(__func__);
1519  auto bucket_normalization = hash_entry_info.bucket_normalization;
1520  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1521  auto launch_count_matches = [bucket_normalization,
1522  count_buff = buff + hash_entry_count,
1523  &join_column,
1524  &type_info,
1525  sd_inner_to_outer_translation_map,
1526  min_inner_elem](auto cpu_thread_idx,
1527  auto cpu_thread_count) {
1528  SUFFIX(count_matches_bucketized)
1529  (count_buff,
1530  join_column,
1531  type_info,
1532  sd_inner_to_outer_translation_map,
1533  min_inner_elem,
1534  cpu_thread_idx,
1535  cpu_thread_count,
1536  bucket_normalization);
1537  };
1538  auto launch_fill_row_ids = [bucket_normalization,
1539  hash_entry_count,
1540  buff,
1541  &join_column,
1542  &type_info,
1543  sd_inner_to_outer_translation_map,
1544  min_inner_elem](auto cpu_thread_idx,
1545  auto cpu_thread_count) {
1546  SUFFIX(fill_row_ids_bucketized)
1547  (buff,
1548  hash_entry_count,
1549  join_column,
1550  type_info,
1551  sd_inner_to_outer_translation_map,
1552  min_inner_elem,
1553  cpu_thread_idx,
1554  cpu_thread_count,
1555  bucket_normalization);
1556  };
1557 
1558  fill_one_to_many_hash_table_impl(buff,
1559  hash_entry_count,
1560  join_column,
1561  type_info,
1562  sd_inner_to_outer_translation_map,
1563  min_inner_elem,
1564  cpu_thread_count,
1565  launch_count_matches,
1566  launch_fill_row_ids);
1567 }
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
#define DEBUG_TIMER(name)
Definition: Logger.h:371
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_func,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_func 
)

Definition at line 1397 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, and inclusive_scan().

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_bucketized().

1405  {
1406  auto timer = DEBUG_TIMER(__func__);
1407  int32_t* pos_buff = buff;
1408  int32_t* count_buff = buff + hash_entry_count;
1409  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1410  std::vector<std::future<void>> counter_threads;
1411  for (unsigned cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1412  counter_threads.push_back(std::async(
1413  std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));
1414  }
1415 
1416  for (auto& child : counter_threads) {
1417  child.get();
1418  }
1419 
1420  std::vector<int32_t> count_copy(hash_entry_count, 0);
1421  CHECK_GT(hash_entry_count, int64_t(0));
1422  memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1423 #if HAVE_CUDA
1424  thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());
1425 #else
1426  inclusive_scan(
1427  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1428 #endif
1429  std::vector<std::future<void>> pos_threads;
1430  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1431  pos_threads.push_back(std::async(
1432  std::launch::async,
1433  [&](size_t thread_idx) {
1434  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1435  if (count_buff[i]) {
1436  pos_buff[i] = count_copy[i];
1437  }
1438  }
1439  },
1440  cpu_thread_idx));
1441  }
1442  for (auto& child : pos_threads) {
1443  child.get();
1444  }
1445 
1446  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1447  std::vector<std::future<void>> rowid_threads;
1448  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1449  rowid_threads.push_back(std::async(
1450  std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));
1451  }
1452 
1453  for (auto& child : rowid_threads) {
1454  child.get();
1455  }
1456 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:234
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:371

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1629 of file HashJoinRuntime.cpp.

References count_matches_sharded(), fill_one_to_many_hash_table_sharded_impl(), fill_row_ids_sharded(), and SUFFIX.

1636  {
1637  auto launch_count_matches = [count_buff = buff + hash_entry_count,
1638  &join_column,
1639  &type_info,
1640  &shard_info
1641 #ifndef __CUDACC__
1642  ,
1643  sd_inner_to_outer_translation_map,
1644  min_inner_elem
1645 #endif
1646  ](auto cpu_thread_idx, auto cpu_thread_count) {
1647  return SUFFIX(count_matches_sharded)(count_buff,
1648  join_column,
1649  type_info,
1650  shard_info
1651 #ifndef __CUDACC__
1652  ,
1653  sd_inner_to_outer_translation_map,
1654  min_inner_elem,
1655  cpu_thread_idx,
1656  cpu_thread_count
1657 #endif
1658  );
1659  };
1660 
1661  auto launch_fill_row_ids = [buff,
1662  hash_entry_count,
1663  &join_column,
1664  &type_info,
1665  &shard_info
1666 #ifndef __CUDACC__
1667  ,
1668  sd_inner_to_outer_translation_map,
1669  min_inner_elem
1670 #endif
1671  ](auto cpu_thread_idx, auto cpu_thread_count) {
1672  return SUFFIX(fill_row_ids_sharded)(buff,
1673  hash_entry_count,
1674  join_column,
1675  type_info,
1676  shard_info
1677 #ifndef __CUDACC__
1678  ,
1679  sd_inner_to_outer_translation_map,
1680  min_inner_elem,
1681  cpu_thread_idx,
1682  cpu_thread_count);
1683 #endif
1684  };
1685 
1686  fill_one_to_many_hash_table_sharded_impl(buff,
1687  hash_entry_count,
1688  join_column,
1689  type_info,
1690  shard_info
1691 #ifndef __CUDACC__
1692  ,
1693  sd_inner_to_outer_translation_map,
1694  min_inner_elem,
1695  cpu_thread_count
1696 #endif
1697  ,
1698  launch_count_matches,
1699  launch_fill_row_ids);
1700 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_LAUNCH_FUNCTOR , typename FILL_ROW_IDS_LAUNCH_FUNCTOR >
void fill_one_to_many_hash_table_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
COUNT_MATCHES_LAUNCH_FUNCTOR  count_matches_launcher,
FILL_ROW_IDS_LAUNCH_FUNCTOR  fill_row_ids_launcher 
)

Definition at line 1570 of file HashJoinRuntime.cpp.

References threading_serial::async(), CHECK_GT, DEBUG_TIMER, and inclusive_scan().

Referenced by fill_one_to_many_hash_table_sharded().

1580  {
1581  auto timer = DEBUG_TIMER(__func__);
1582  int32_t* pos_buff = buff;
1583  int32_t* count_buff = buff + hash_entry_count;
1584  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1585  std::vector<std::future<void>> counter_threads;
1586  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1587  counter_threads.push_back(std::async(
1588  std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));
1589  }
1590 
1591  for (auto& child : counter_threads) {
1592  child.get();
1593  }
1594 
1595  std::vector<int32_t> count_copy(hash_entry_count, 0);
1596  CHECK_GT(hash_entry_count, int64_t(0));
1597  memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));
1598  inclusive_scan(
1599  count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);
1600  std::vector<std::future<void>> pos_threads;
1601  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1602  pos_threads.push_back(std::async(
1603  std::launch::async,
1604  [&](const unsigned thread_idx) {
1605  for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {
1606  if (count_buff[i]) {
1607  pos_buff[i] = count_copy[i];
1608  }
1609  }
1610  },
1611  cpu_thread_idx));
1612  }
1613  for (auto& child : pos_threads) {
1614  child.get();
1615  }
1616 
1617  memset(count_buff, 0, hash_entry_count * sizeof(int32_t));
1618  std::vector<std::future<void>> rowid_threads;
1619  for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {
1620  rowid_threads.push_back(std::async(
1621  std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));
1622  }
1623 
1624  for (auto& child : rowid_threads) {
1625  child.get();
1626  }
1627 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
#define CHECK_GT(x, y)
Definition: Logger.h:234
future< Result > async(Fn &&fn, Args &&...args)
#define DEBUG_TIMER(name)
Definition: Logger.h:371

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 950 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table(), and fill_one_to_many_hash_table_on_device().

961  {
962  auto slot_sel = [&type_info](auto pos_buff, auto elem) {
963  return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);
964  };
965 
966  fill_row_ids_impl(buff,
967  hash_entry_count,
968  join_column,
969  type_info
970 #ifndef __CUDACC__
971  ,
972  sd_inner_to_outer_translation_map,
973  min_inner_elem,
974  cpu_thread_idx,
975  cpu_thread_count
976 #endif
977  ,
978  slot_sel);
979 }
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:74
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
GLOBAL void SUFFIX() fill_row_ids_baseline ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const KEY_HANDLER *  f,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1152 of file HashJoinRuntime.cpp.

References g_maximum_conditions_to_coalesce, get_matching_baseline_hash_slot_readonly(), mapd_add, SUFFIX, and heavydb.dtypes::T.

Referenced by fill_one_to_many_baseline_hash_table().

1162  {
1163  int32_t* pos_buff = buff;
1164  int32_t* count_buff = buff + hash_entry_count;
1165  int32_t* id_buff = count_buff + hash_entry_count;
1166 #ifdef __CUDACC__
1167  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1168  int32_t step = blockDim.x * gridDim.x;
1169 #else
1170  int32_t start = cpu_thread_idx;
1171  int32_t step = cpu_thread_count;
1172 #endif
1173 
1174  T key_scratch_buff[g_maximum_conditions_to_coalesce];
1175 #ifdef __CUDACC__
1176  assert(composite_key_dict);
1177 #endif
1178  const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);
1179  auto key_buff_handler = [composite_key_dict,
1180  hash_entry_count,
1181  pos_buff,
1182  count_buff,
1183  id_buff,
1184  key_size_in_bytes](const int64_t row_index,
1185  const T* key_scratch_buff,
1186  const size_t key_component_count) {
1187  const T* matching_group =
1188  SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,
1189  key_component_count,
1190  composite_key_dict,
1191  hash_entry_count,
1192  key_size_in_bytes);
1193  const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;
1194  int32_t* pos_ptr = pos_buff + entry_idx;
1195  const auto bin_idx = pos_ptr - pos_buff;
1196  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1197  id_buff[id_buff_idx] = static_cast<int32_t>(row_index);
1198  return 0;
1199  };
1200 
1201  JoinColumnTuple cols(
1202  f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());
1203  for (auto& it : cols.slice(start, step)) {
1204  (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);
1205  }
1206  return;
1207 }
#define SUFFIX(name)
constexpr double f
Definition: Utm.h:31
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
#define mapd_add(address, val)
const size_t g_maximum_conditions_to_coalesce

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 981 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_bucketized(), and fill_one_to_many_hash_table_on_device_bucketized().

994  {
995  auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {
996  return SUFFIX(get_bucketized_hash_slot)(
997  pos_buff, elem, type_info.min_val, bucket_normalization);
998  };
999  fill_row_ids_impl(buff,
1000  hash_entry_count,
1001  join_column,
1002  type_info
1003 #ifndef __CUDACC__
1004  ,
1005  sd_inner_to_outer_translation_map,
1006  min_inner_elem,
1007  cpu_thread_idx,
1008  cpu_thread_count
1009 #endif
1010  ,
1011  slot_sel);
1012 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
const int64_t min_val

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 894 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), and mapd_add.

Referenced by fill_row_ids(), fill_row_ids_bucketized(), fill_row_ids_sharded(), and fill_row_ids_sharded_bucketized().

906  {
907  int32_t* pos_buff = buff;
908  int32_t* count_buff = buff + hash_entry_count;
909  int32_t* id_buff = count_buff + hash_entry_count;
910 
911 #ifdef __CUDACC__
912  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
913  int32_t step = blockDim.x * gridDim.x;
914 #else
915  int32_t start = cpu_thread_idx;
916  int32_t step = cpu_thread_count;
917 #endif
918  JoinColumnTyped col{&join_column, &type_info};
919  for (auto item : col.slice(start, step)) {
920  const size_t index = item.index;
921  int64_t elem = item.element;
922  if (elem == type_info.null_val) {
923  if (type_info.uses_bw_eq) {
924  elem = type_info.translated_null_val;
925  } else {
926  continue;
927  }
928  }
929 #ifndef __CUDACC__
930  if (sd_inner_to_outer_translation_map &&
931  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
932  const auto outer_id = map_str_id_to_outer_dict(elem,
933  min_inner_elem,
934  type_info.min_val,
935  type_info.max_val,
936  sd_inner_to_outer_translation_map);
937  if (outer_id == StringDictionary::INVALID_STR_ID) {
938  continue;
939  }
940  elem = outer_id;
941  }
942 #endif
943  auto pos_ptr = slot_selector(pos_buff, elem);
944  const auto bin_idx = pos_ptr - pos_buff;
945  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
946  id_buff[id_buff_idx] = static_cast<int32_t>(index);
947  }
948 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1073 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_hash_slot_sharded(), and SUFFIX.

Referenced by fill_one_to_many_hash_table_on_device_sharded(), and fill_one_to_many_hash_table_sharded().

1085  {
1086  auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {
1087  return SUFFIX(get_hash_slot_sharded)(pos_buff,
1088  elem,
1089  type_info.min_val,
1090  shard_info.entry_count_per_shard,
1091  shard_info.num_shards,
1092  shard_info.device_count);
1093  };
1094 
1095  fill_row_ids_impl(buff,
1096  hash_entry_count,
1097  join_column,
1098  type_info
1099 #ifndef __CUDACC__
1100  ,
1101  sd_inner_to_outer_translation_map,
1102  min_inner_elem,
1103  cpu_thread_idx,
1104  cpu_thread_count
1105 #endif
1106  ,
1107  slot_sel);
1108 }
const size_t num_shards
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
const int64_t min_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:95

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 1110 of file HashJoinRuntime.cpp.

References fill_row_ids_impl(), get_bucketized_hash_slot_sharded(), and SUFFIX.

1124  {
1125  auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,
1126  auto elem) {
1127  return SUFFIX(get_bucketized_hash_slot_sharded)(pos_buff,
1128  elem,
1129  type_info.min_val,
1130  shard_info.entry_count_per_shard,
1131  shard_info.num_shards,
1132  shard_info.device_count,
1133  bucket_normalization);
1134  };
1135 
1136  fill_row_ids_impl(buff,
1137  hash_entry_count,
1138  join_column,
1139  type_info
1140 #ifndef __CUDACC__
1141  ,
1142  sd_inner_to_outer_translation_map,
1143  min_inner_elem,
1144  cpu_thread_idx,
1145  cpu_thread_count
1146 #endif
1147  ,
1148  slot_sel);
1149 }
const size_t num_shards
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
#define SUFFIX(name)
const int device_count
const size_t entry_count_per_shard
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:80

+ Here is the call graph for this function:

template<typename SLOT_SELECTOR >
DEVICE void fill_row_ids_sharded_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
SLOT_SELECTOR  slot_selector 
)

Definition at line 1015 of file HashJoinRuntime.cpp.

References StringDictionary::INVALID_STR_ID, anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict(), mapd_add, JoinColumnTypeInfo::max_val, JoinColumnTypeInfo::min_val, JoinColumnTypeInfo::null_val, JoinColumnTypeInfo::translated_null_val, and JoinColumnTypeInfo::uses_bw_eq.

1028  {
1029 
1030  int32_t* pos_buff = buff;
1031  int32_t* count_buff = buff + hash_entry_count;
1032  int32_t* id_buff = count_buff + hash_entry_count;
1033 
1034 #ifdef __CUDACC__
1035  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
1036  int32_t step = blockDim.x * gridDim.x;
1037 #else
1038  int32_t start = cpu_thread_idx;
1039  int32_t step = cpu_thread_count;
1040 #endif
1041  JoinColumnTyped col{&join_column, &type_info};
1042  for (auto item : col.slice(start, step)) {
1043  const size_t index = item.index;
1044  int64_t elem = item.element;
1045  if (elem == type_info.null_val) {
1046  if (type_info.uses_bw_eq) {
1047  elem = type_info.translated_null_val;
1048  } else {
1049  continue;
1050  }
1051  }
1052 #ifndef __CUDACC__
1053  if (sd_inner_to_outer_translation_map &&
1054  (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {
1055  const auto outer_id = map_str_id_to_outer_dict(elem,
1056  min_inner_elem,
1057  type_info.min_val,
1058  type_info.max_val,
1059  sd_inner_to_outer_translation_map);
1060  if (outer_id == StringDictionary::INVALID_STR_ID) {
1061  continue;
1062  }
1063  elem = outer_id;
1064  }
1065 #endif
1066  auto* pos_ptr = slot_selector(pos_buff, elem);
1067  const auto bin_idx = pos_ptr - pos_buff;
1068  const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;
1069  id_buff[id_buff_idx] = static_cast<int32_t>(index);
1070  }
1071 }
const int64_t null_val
const int64_t translated_null_val
static constexpr int32_t INVALID_STR_ID
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
const int64_t max_val
const int64_t min_val
#define mapd_add(address, val)

+ Here is the call graph for this function:

template<typename T >
T* get_matching_baseline_hash_slot_at ( int8_t *  hash_buff,
const uint32_t  h,
const T *  key,
const size_t  key_component_count,
const int64_t  hash_entry_size 
)

Definition at line 468 of file HashJoinRuntime.cpp.

References cas_cst, get_invalid_key(), load_cst, store_cst, SUFFIX, heavydb.dtypes::T, and UNLIKELY.

Referenced by write_baseline_hash_slot(), and write_baseline_hash_slot_for_semi_join().

// Claims (or locates) the baseline-hash row at bucket `h` for `key`.
// Returns a pointer to the slot(s) immediately after the key columns when the
// row is empty (we win the claim) or already holds exactly this key; returns
// nullptr when the row belongs to a different key, so the caller must probe
// the next bucket.
// Concurrency protocol: the first key column doubles as a latch. An empty row
// holds the invalid key; a row being written holds invalid_key - 1
// ("write pending"). Winner CAS-es the latch to pending, copies the remaining
// key columns, then publishes by storing the real first column (store_cst is
// SEQ_CST, so the memcpy is visible before the publish). Losers spin until the
// pending marker clears, then compare the full key.
472  {
// NOTE(review): `off` is uint32_t but h * hash_entry_size is a 64-bit
// product — could truncate for very large tables; confirm sizing keeps
// this in range.
473  uint32_t off = h * hash_entry_size;
474  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
475  T empty_key = SUFFIX(get_invalid_key)<T>();
476  T write_pending = SUFFIX(get_invalid_key)<T>() - 1;
477  if (UNLIKELY(*key == write_pending)) {
478  // Address the singularity case where the first column contains the pending
479  // write special value. Should never happen, but avoid doing wrong things.
480  return nullptr;
481  }
// Try to claim the row: empty_key -> write_pending on the first key column.
482  const bool success = cas_cst(row_ptr, &empty_key, write_pending);
483  if (success) {
484  if (key_component_count > 1) {
// Remaining key columns are private until the publish below, plain memcpy is safe.
485  memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));
486  }
// Publish: overwrite the pending marker with the real first key column.
487  store_cst(row_ptr, *key);
488  return reinterpret_cast<T*>(row_ptr + key_component_count);
489  }
// CAS failed: another thread owns this row. Wait for its publish to complete.
490  while (load_cst(row_ptr) == write_pending) {
491  // spin until the winning thread has finished writing the entire key
492  }
// Row is fully written; accept it only if it holds exactly our key.
493  for (size_t i = 0; i < key_component_count; ++i) {
494  if (load_cst(row_ptr + i) != key[i]) {
495  return nullptr;
496  }
497  }
498  return reinterpret_cast<T*>(row_ptr + key_component_count);
499 }
#define SUFFIX(name)
#define load_cst(ptr)
DEVICE T SUFFIX() get_invalid_key()
#define cas_cst(ptr, expected, desired)
#define UNLIKELY(x)
Definition: likely.h:25
#define store_cst(ptr, val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE NEVER_INLINE const T* SUFFIX() get_matching_baseline_hash_slot_readonly ( const T *  key,
const size_t  key_component_count,
const T *  composite_key_dict,
const int64_t  entry_count,
const size_t  key_size_in_bytes 
)

Definition at line 818 of file HashJoinRuntime.cpp.

References CHECK, keys_are_equal(), and MurmurHash1Impl().

Referenced by count_matches_baseline(), and fill_row_ids_baseline().

// Read-only lookup of `key` in a fully-built composite key dictionary.
// Hashes the key, then linear-probes (with wrap-around) until the matching
// entry is found. The key is expected to be present: exhausting the whole
// table trips CHECK/assert rather than returning a "not found" result.
823  {
824  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
// NOTE(review): `off` is uint32_t; h * key_component_count may exceed 2^32
// for very large dictionaries — confirm sizing keeps this in range.
825  uint32_t off = h * key_component_count;
826  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
827  return &composite_key_dict[off];
828  }
// Linear probe: advance one bucket at a time, wrapping, until we are back
// at the starting bucket.
829  uint32_t h_probe = (h + 1) % entry_count;
830  while (h_probe != h) {
831  off = h_probe * key_component_count;
832  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
833  return &composite_key_dict[off];
834  }
835  h_probe = (h_probe + 1) % entry_count;
836  }
// Full sweep with no match: a violated invariant, not a recoverable miss.
837 #ifndef __CUDACC__
838  CHECK(false);
839 #else
840  assert(false);
841 #endif
842  return nullptr;
843 }
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename InputIterator , typename OutputIterator >
void inclusive_scan ( InputIterator  first,
InputIterator  last,
OutputIterator  out,
const size_t  thread_count 
)

Definition at line 1325 of file HashJoinRuntime.cpp.

References threading_serial::async(), and gpu_enabled::partial_sum().

Referenced by approximate_distinct_tuples_on_device_overlaps(), approximate_distinct_tuples_on_device_range(), approximate_distinct_tuples_overlaps(), approximate_distinct_tuples_range(), fill_one_to_many_baseline_hash_table(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_impl(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), fill_one_to_many_hash_table_sharded_impl(), and gpu_enabled::partial_sum().

// Multithreaded inclusive prefix sum of [first, last) into `out`.
// Small inputs (< 10000 elements) or thread_count <= 1 fall back to a serial
// scan. Otherwise runs in three phases:
//   1) split the range into thread_count chunks; each worker scans its chunk
//      independently and records the chunk total in partial_sums;
//   2) serially turn partial_sums into an inclusive scan of chunk totals;
//   3) workers add the preceding chunks' total to every element of chunks
//      1..thread_count-1 (chunk 0 is already correct).
// NOTE(review): this listing drops source lines 1349 and 1381 — presumably
// the first (policy/executor) argument to the async calls; the reference
// index points at threading_serial::async. Confirm against the real source.
1328  {
1329  using ElementType = typename InputIterator::value_type;
1330  using OffsetType = typename InputIterator::difference_type;
1331  const OffsetType elem_count = last - first;
1332  if (elem_count < 10000 || thread_count <= 1) {
// Serial fallback: parallel overhead is not worth it for small inputs.
1333  ElementType sum = 0;
1334  for (auto iter = first; iter != last; ++iter, ++out) {
1335  *out = sum += *iter;
1336  }
1337  return;
1338  }
1339 
// Chunk size, rounded up so thread_count chunks cover the whole range.
1340  const OffsetType step = (elem_count + thread_count - 1) / thread_count;
1341  OffsetType start_off = 0;
1342  OffsetType end_off = std::min(step, elem_count);
1343  std::vector<ElementType> partial_sums(thread_count);
1344  std::vector<std::future<void>> counter_threads;
// Phase 1: per-chunk independent scans; each records its chunk total.
1345  for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,
1346  start_off = std::min(start_off + step, elem_count),
1347  end_off = std::min(start_off + step, elem_count)) {
1348  counter_threads.push_back(std::async(
1350  [first, out](
1351  ElementType& partial_sum, const OffsetType start, const OffsetType end) {
1352  ElementType sum = 0;
1353  for (auto in_iter = first + start, out_iter = out + start;
1354  in_iter != (first + end);
1355  ++in_iter, ++out_iter) {
1356  *out_iter = sum += *in_iter;
1357  }
1358  partial_sum = sum;
1359  },
1360  std::ref(partial_sums[thread_idx]),
1361  start_off,
1362  end_off));
1363  }
1364  for (auto& child : counter_threads) {
1365  child.get();
1366  }
1367 
// Phase 2 (serial): inclusive scan over the chunk totals, so partial_sums[i]
// becomes the sum of all elements in chunks 0..i.
1368  ElementType sum = 0;
1369  for (auto& s : partial_sums) {
1370  s += sum;
1371  sum = s;
1372  }
1373 
// Phase 3: chunk k+1 adds partial_sums[k] (total of everything before it)
// to each of its elements; chunk 0 needs no adjustment.
1374  counter_threads.clear();
1375  start_off = std::min(step, elem_count);
1376  end_off = std::min(start_off + step, elem_count);
1377  for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,
1378  start_off = std::min(start_off + step, elem_count),
1379  end_off = std::min(start_off + step, elem_count)) {
1380  counter_threads.push_back(std::async(
1382  [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {
1383  for (auto iter = out + start; iter != (out + end); ++iter) {
1384  *iter += prev_sum;
1385  }
1386  },
1387  partial_sums[thread_idx],
1388  start_off,
1389  end_off));
1390  }
1391  for (auto& child : counter_threads) {
1392  child.get();
1393  }
1394 }
future< Result > async(Fn &&fn, Args &&...args)
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
DEVICE void SUFFIX() init_baseline_hash_join_buff ( int8_t *  hash_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 349 of file HashJoinRuntime.cpp.

References get_invalid_key(), SUFFIX, and heavydb.dtypes::T.

Referenced by init_baseline_hash_join_buff_wrapper().

355  {
356 #ifdef __CUDACC__
357  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
358  int32_t step = blockDim.x * gridDim.x;
359 #else
360  int32_t start = cpu_thread_idx;
361  int32_t step = cpu_thread_count;
362 #endif
363  auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);
364  const T empty_key = SUFFIX(get_invalid_key)<T>();
365  for (int64_t h = start; h < entry_count; h += step) {
366  int64_t off = h * hash_entry_size;
367  auto row_ptr = reinterpret_cast<T*>(hash_buff + off);
368  for (size_t i = 0; i < key_component_count; ++i) {
369  row_ptr[i] = empty_key;
370  }
371  if (with_val_slot) {
372  row_ptr[key_component_count] = invalid_slot_val;
373  }
374  }
375 }
#define SUFFIX(name)
DEVICE T SUFFIX() get_invalid_key()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1702 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1708  {
1709  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1710  entry_count,
1711  key_component_count,
1712  with_val_slot,
1713  invalid_slot_val,
1714  cpu_thread_idx,
1715  cpu_thread_count);
1716 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1718 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1724  {
1725  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1726  entry_count,
1727  key_component_count,
1728  with_val_slot,
1729  invalid_slot_val,
1730  cpu_thread_idx,
1731  cpu_thread_count);
1732 }

+ Here is the caller graph for this function:

DEVICE void SUFFIX() init_hash_join_buff ( int32_t *  groups_buffer,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 70 of file HashJoinRuntime.cpp.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

74  {
75 #ifdef __CUDACC__
76  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
77  int32_t step = blockDim.x * gridDim.x;
78 #else
79  int32_t start = cpu_thread_idx;
80  int32_t step = cpu_thread_count;
81 #endif
82  for (int64_t i = start; i < hash_entry_count; i += step) {
83  groups_buffer[i] = invalid_slot_val;
84  }
85 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1780 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1788  {
1789  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1790  entry_count,
1791  invalid_slot_val,
1792  false,
1793  key_component_count,
1794  with_val_slot,
1795  key_handler,
1796  num_elems,
1797  cpu_thread_idx,
1798  cpu_thread_count);
1799 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1844 of file HashJoinRuntime.cpp.

1852  {
1853  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1854  entry_count,
1855  invalid_slot_val,
1856  false,
1857  key_component_count,
1858  with_val_slot,
1859  key_handler,
1860  num_elems,
1861  cpu_thread_idx,
1862  cpu_thread_count);
1863 }
int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1801 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1809  {
1810  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1811  entry_count,
1812  invalid_slot_val,
1813  false,
1814  key_component_count,
1815  with_val_slot,
1816  key_handler,
1817  num_elems,
1818  cpu_thread_idx,
1819  cpu_thread_count);
1820 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1865 of file HashJoinRuntime.cpp.

1873  {
1874  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1875  entry_count,
1876  invalid_slot_val,
1877  false,
1878  key_component_count,
1879  with_val_slot,
1880  key_handler,
1881  num_elems,
1882  cpu_thread_idx,
1883  cpu_thread_count);
1884 }
template<typename T >
DEVICE int write_baseline_hash_slot ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 508 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and heavydb.dtypes::T.

516  {
517  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
518  T* matching_group = get_matching_baseline_hash_slot_at(
519  hash_buff, h, key, key_component_count, hash_entry_size);
520  if (!matching_group) {
521  uint32_t h_probe = (h + 1) % entry_count;
522  while (h_probe != h) {
523  matching_group = get_matching_baseline_hash_slot_at(
524  hash_buff, h_probe, key, key_component_count, hash_entry_size);
525  if (matching_group) {
526  break;
527  }
528  h_probe = (h_probe + 1) % entry_count;
529  }
530  }
531  if (!matching_group) {
532  return -2;
533  }
534  if (!with_val_slot) {
535  return 0;
536  }
537  if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {
538  return -1;
539  }
540  return 0;
541 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function:

template<typename T >
DEVICE int write_baseline_hash_slot_for_semi_join ( const int32_t  val,
int8_t *  hash_buff,
const int64_t  entry_count,
const T *  key,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const size_t  key_size_in_bytes,
const size_t  hash_entry_size 
)

Definition at line 544 of file HashJoinRuntime.cpp.

References get_matching_baseline_hash_slot_at(), mapd_cas, MurmurHash1Impl(), and heavydb.dtypes::T.

552  {
553  const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;
554  T* matching_group = get_matching_baseline_hash_slot_at(
555  hash_buff, h, key, key_component_count, hash_entry_size);
556  if (!matching_group) {
557  uint32_t h_probe = (h + 1) % entry_count;
558  while (h_probe != h) {
559  matching_group = get_matching_baseline_hash_slot_at(
560  hash_buff, h_probe, key, key_component_count, hash_entry_size);
561  if (matching_group) {
562  break;
563  }
564  h_probe = (h_probe + 1) % entry_count;
565  }
566  }
567  if (!matching_group) {
568  return -2;
569  }
570  if (!with_val_slot) {
571  return 0;
572  }
573  mapd_cas(matching_group, invalid_slot_val, val);
574  return 0;
575 }
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6
#define mapd_cas(address, compare, val)

+ Here is the call graph for this function: