OmniSciDB  6686921089
HashJoinRuntime.h File Reference
#include <cstddef>
#include <cstdint>
#include <vector>
#include "../../../Shared/SqlTypesLayout.h"
#include "../../../Shared/sqltypes.h"
#include "../../RuntimeFunctions.h"
#include "../../../Shared/funcannotations.h"
+ Include dependency graph for HashJoinRuntime.h:
+ This graph shows which files directly or indirectly include this file:


Classes

struct  HashEntryInfo
 
struct  JoinChunk
 
struct  JoinColumn
 
struct  JoinColumnTypeInfo
 
struct  JoinBucketInfo
 
struct  ShardInfo
 

Enumerations

enum  ColumnType { SmallDate = 0, Signed = 1, Unsigned = 2, Double = 3 }
 

Functions

void init_hash_join_buff (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
ColumnType get_join_column_type_kind (const SQLTypeInfo &ti)
 
int fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner, const void *sd_outer, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
int fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner, const void *sd_outer, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
 
void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
 
void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_sharded_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_on_device (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const void * > &sd_inner_proxy_per_key, const std::vector< const void * > &sd_outer_proxy_per_key, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
 

Variables

const size_t g_maximum_conditions_to_coalesce {8}
 

Enumeration Type Documentation

enum ColumnType
Enumerator
SmallDate 
Signed 
Unsigned 
Double 

Definition at line 101 of file HashJoinRuntime.h.
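
As a quick illustration of the enumerators, a minimal sketch (not taken from the OmniSciDB sources) that maps a ColumnType value to a printable label:

    #include "HashJoinRuntime.h"

    // Illustrative helper only: name each ColumnType enumerator.
    inline const char* column_type_name(const ColumnType kind) {
      switch (kind) {
        case SmallDate:
          return "SmallDate";
        case Signed:
          return "Signed";
        case Unsigned:
          return "Unsigned";
        case Double:
          return "Double";
      }
      return "unknown";  // unreachable for valid enumerators
    }

get_join_column_type_kind(), documented below, produces these values from a column's SQLTypeInfo.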

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2143 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2148  {
2149  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2150  CHECK(!join_column_per_key.empty());
2151 
2152  std::vector<std::future<void>> approx_distinct_threads;
2153  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2154  approx_distinct_threads.push_back(std::async(
2155  std::launch::async,
2156  [&join_column_per_key,
2157  &type_info_per_key,
2158  b,
2159  hll_buffer_all_cpus,
2160  padded_size_bytes,
2161  thread_idx,
2162  thread_count] {
2163  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2164 
2165  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2166  false,
2167  &join_column_per_key[0],
2168  &type_info_per_key[0],
2169  nullptr,
2170  nullptr);
2171  approximate_distinct_tuples_impl(hll_buffer,
2172  nullptr,
2173  b,
2174  join_column_per_key[0].num_elems,
2175  &key_handler,
2176  thread_idx,
2177  thread_count);
2178  }));
2179  }
2180  for (auto& child : approx_distinct_threads) {
2181  child.get();
2182  }
2183 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
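
A minimal CPU usage sketch. It assumes b is the HyperLogLog precision (number of bucket bits) and padded_size_bytes is the size of one per-thread register buffer, which is how the definition above indexes hll_buffer_all_cpus; the concrete values below are illustrative, and the join-column vectors are presumed to be populated by the caller:

    #include <vector>
    #include "HashJoinRuntime.h"

    // Sketch only: run the CPU distinct-tuple estimator. Each worker thread
    // writes into its own slice at hll_buffer_all_cpus + thread_idx * padded_size_bytes.
    void sketch_approximate_count(const std::vector<JoinColumn>& join_columns,
                                  const std::vector<JoinColumnTypeInfo>& type_infos,
                                  const int thread_count) {
      const uint32_t b = 11;                             // assumed HLL precision
      const size_t padded_size_bytes = size_t(1) << b;   // assumed per-thread buffer size
      std::vector<uint8_t> hll_buffers(thread_count * padded_size_bytes, 0);
      approximate_distinct_tuples(
          hll_buffers.data(), b, padded_size_bytes, join_columns, type_infos, thread_count);
      // The caller (e.g. BaselineJoinHashTable::approximateTupleCount) is then
      // expected to merge the per-thread registers into a single estimate.
    }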

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 630 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

633  {
634  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
635  hll_buffer,
636  nullptr,
637  b,
638  num_elems,
639  key_handler);
640 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
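
A device-side sketch under similar assumptions (the HLL precision and register-buffer size are illustrative, and key_handler is presumed to already reside in GPU memory, since the function takes a raw device pointer):

    #include <cuda_runtime.h>
    #include "HashJoinRuntime.h"

    // Sketch only: run the distinct-tuple estimator on the GPU.
    void sketch_gpu_approximate_count(const GenericKeyHandler* dev_key_handler,
                                      const int64_t num_elems) {
      const uint32_t b = 11;                   // assumed HLL precision
      const size_t hll_bytes = size_t(1) << b; // assumed register-buffer size
      uint8_t* dev_hll_buffer = nullptr;
      cudaMalloc(&dev_hll_buffer, hll_bytes);
      cudaMemset(dev_hll_buffer, 0, hll_bytes);
      approximate_distinct_tuples_on_device(dev_hll_buffer, b, dev_key_handler, num_elems);
      // Copy the registers back with cudaMemcpy and fold them into an estimate.
      cudaFree(dev_hll_buffer);
    }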

void approximate_distinct_tuples_on_device_overlaps ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 598 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

602  {
603  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
604  hll_buffer,
605  row_counts_buffer,
606  b,
607  num_elems,
608  key_handler);
609 
610  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
611  thrust::inclusive_scan(
612  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
613 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_range ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 615 of file HashJoinRuntimeGpu.cu.

References inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

621  {
622  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x>>>(
623  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
624 
625  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
626  thrust::inclusive_scan(
627  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
628 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2185 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2193  {
2194  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2195  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2196  CHECK(!join_column_per_key.empty());
2197 
2198  std::vector<std::future<void>> approx_distinct_threads;
2199  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2200  approx_distinct_threads.push_back(std::async(
2201  std::launch::async,
2202  [&join_column_per_key,
2203  &join_buckets_per_key,
2204  &row_counts,
2205  b,
2206  hll_buffer_all_cpus,
2207  padded_size_bytes,
2208  thread_idx,
2209  thread_count] {
2210  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2211 
2212  const auto key_handler = OverlapsKeyHandler(
2213  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2214  &join_column_per_key[0],
2215  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2216  approximate_distinct_tuples_impl(hll_buffer,
2217  row_counts.data(),
2218  b,
2219  join_column_per_key[0].num_elems,
2220  &key_handler,
2221  thread_idx,
2222  thread_count);
2223  }));
2224  }
2225  for (auto& child : approx_distinct_threads) {
2226  child.get();
2227  }
2228 
2229  inclusive_scan(
2230  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2231 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2233 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2242  {
2243  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2244  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2245  CHECK(!join_column_per_key.empty());
2246 
2247  std::vector<std::future<void>> approx_distinct_threads;
2248  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2249  approx_distinct_threads.push_back(std::async(
2250  std::launch::async,
2251  [&join_column_per_key,
2252  &join_buckets_per_key,
2253  &row_counts,
2254  b,
2255  hll_buffer_all_cpus,
2256  padded_size_bytes,
2257  thread_idx,
2258  is_compressed,
2259  thread_count] {
2260  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2261 
2262  const auto key_handler = RangeKeyHandler(
2263  is_compressed,
2264  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2265  &join_column_per_key[0],
2266  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2267  approximate_distinct_tuples_impl(hll_buffer,
2268  row_counts.data(),
2269  b,
2270  join_column_per_key[0].num_elems,
2271  &key_handler,
2272  thread_idx,
2273  thread_count);
2274  }));
2275  }
2276  for (auto& child : approx_distinct_threads) {
2277  child.get();
2278  }
2279 
2280  inclusive_scan(
2281  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2282 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2284 of file HashJoinRuntime.cpp.

References threading_serial::async(), and i.

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2288  {
2289  std::vector<std::vector<double>> bucket_sizes_for_threads;
2290  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2291  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2292  }
2293  std::vector<std::future<void>> threads;
2294  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2295  threads.push_back(std::async(std::launch::async,
2296  compute_bucket_sizes_impl<2>,
2297  bucket_sizes_for_threads[thread_idx].data(),
2298  &join_column,
2299  &type_info,
2300  bucket_size_thresholds.data(),
2301  thread_idx,
2302  thread_count));
2303  }
2304  for (auto& child : threads) {
2305  child.get();
2306  }
2307 
2308  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2309  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2310  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2311  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2312  }
2313  }
2314  }
2315 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
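
A minimal calling sketch, assuming a two-dimensional (overlaps-style) key; the dimension count and threshold values are illustrative, and join_column / type_info are presumed to describe the inner geospatial column:

    #include <vector>
    #include "HashJoinRuntime.h"

    // Sketch only: per the definition above, each worker computes candidate
    // bucket sizes and the per-dimension maxima are kept.
    std::vector<double> sketch_bucket_sizes(const JoinColumn& join_column,
                                            const JoinColumnTypeInfo& type_info,
                                            const int thread_count) {
      std::vector<double> bucket_sizes(2, 0.0);       // one entry per dimension (assumed 2-D)
      const std::vector<double> thresholds(2, 0.1);   // assumed tuning thresholds
      compute_bucket_sizes_on_cpu(
          bucket_sizes, join_column, type_info, thresholds, thread_count);
      return bucket_sizes;
    }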

void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds 
)

Definition at line 642 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

645  {
646  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
647  bucket_sizes_buffer,
648  join_column,
649  type_info,
650  bucket_sz_threshold);
651 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1742 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1751  {
1752  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1753  entry_count,
1754  invalid_slot_val,
1755  for_semi_join,
1756  key_component_count,
1757  with_val_slot,
1758  key_handler,
1759  num_elems,
1760  cpu_thread_idx,
1761  cpu_thread_count);
1762 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1806 of file HashJoinRuntime.cpp.

1815  {
1816  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1817  entry_count,
1818  invalid_slot_val,
1819  for_semi_join,
1820  key_component_count,
1821  with_val_slot,
1822  key_handler,
1823  num_elems,
1824  cpu_thread_idx,
1825  cpu_thread_count);
1826 }
void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 450 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

458  {
459  cuda_kernel_launch_wrapper(
460  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
461  hash_buff,
462  entry_count,
463  invalid_slot_val,
464  for_semi_join,
465  key_component_count,
466  with_val_slot,
467  dev_err_buff,
468  key_handler,
469  num_elems);
470 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 472 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

480  {
481  cuda_kernel_launch_wrapper(
482  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
483  hash_buff,
484  entry_count,
485  invalid_slot_val,
486  for_semi_join,
487  key_component_count,
488  with_val_slot,
489  dev_err_buff,
490  key_handler,
491  num_elems);
492 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

int fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner,
const void *  sd_outer,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 197 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

205  {
206  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
207  : SUFFIX(fill_one_to_one_hashtable);
208  auto hashtable_filling_func = [&](auto elem, size_t index) {
209  auto entry_ptr = SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
210  return filling_func(index, entry_ptr, invalid_slot_val);
211  };
212 
213  return fill_hash_join_buff_impl(buff,
214  invalid_slot_val,
215  join_column,
216  type_info,
217  sd_inner_proxy,
218  sd_outer_proxy,
219  cpu_thread_idx,
220  cpu_thread_count,
221  hashtable_filling_func);
222 }
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:75
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
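
A sketch of the CPU threading pattern used by callers such as PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu(): clear the slots with init_hash_join_buff() (documented below), then fill from several workers identified by cpu_thread_idx. Treating a non-zero return as a failed one-to-one build and passing null dictionary proxies are assumptions, not statements from this page:

    #include <future>
    #include <vector>
    #include "HashJoinRuntime.h"

    // Sketch only: build a perfect (one-to-one) hash table on the CPU.
    int sketch_build_one_to_one(int32_t* buff,
                                const int64_t entry_count,
                                const int32_t invalid_slot_val,
                                const JoinColumn& join_column,
                                const JoinColumnTypeInfo& type_info,
                                const int thread_count) {
      std::vector<std::future<void>> init_workers;
      for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
        init_workers.push_back(std::async(std::launch::async, [&, thread_idx] {
          // Each worker clears its stripe of slots.
          init_hash_join_buff(buff, entry_count, invalid_slot_val, thread_idx, thread_count);
        }));
      }
      for (auto& w : init_workers) {
        w.get();
      }
      std::vector<std::future<int>> fill_workers;
      for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
        fill_workers.push_back(std::async(std::launch::async, [&, thread_idx] {
          // Null proxies assume no string-dictionary translation is needed.
          return fill_hash_join_buff(buff,
                                     invalid_slot_val,
                                     /*for_semi_join=*/false,
                                     join_column,
                                     type_info,
                                     /*sd_inner=*/nullptr,
                                     /*sd_outer=*/nullptr,
                                     thread_idx,
                                     thread_count);
        }));
      }
      int err = 0;
      for (auto& w : fill_workers) {
        err |= w.get();
      }
      return err;  // assumed: non-zero signals a failed build (e.g. duplicate key)
    }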

int fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const void *  sd_inner,
const void *  sd_outer,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 168 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

177  {
178  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
179  : SUFFIX(fill_one_to_one_hashtable);
180  auto hashtable_filling_func = [&](auto elem, size_t index) {
181  auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(
182  buff, elem, type_info.min_val, bucket_normalization);
183  return filling_func(index, entry_ptr, invalid_slot_val);
184  };
185 
186  return fill_hash_join_buff_impl(buff,
187  invalid_slot_val,
188  join_column,
189  type_info,
190  sd_inner_proxy,
191  sd_outer_proxy,
192  cpu_thread_idx,
193  cpu_thread_count,
194  hashtable_filling_func);
195 }
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:67
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_hash_join_buff_on_device ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info 
)

Definition at line 81 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

86  {
87  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
88  buff,
89  invalid_slot_val,
90  for_semi_join,
91  join_column,
92  type_info,
93  dev_err_buff);
94 }
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int64_t  bucket_normalization 
)

Definition at line 64 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

70  {
71  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
72  buff,
73  invalid_slot_val,
74  for_semi_join,
75  join_column,
76  type_info,
77  dev_err_buff,
78  bucket_normalization);
79 }
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info 
)

Definition at line 159 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

165  {
166  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
167  buff,
168  invalid_slot_val,
169  for_semi_join,
170  join_column,
171  type_info,
172  shard_info,
173  dev_err_buff);
174 }
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int64_t  bucket_normalization 
)

Definition at line 139 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

147  {
148  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
149  buff,
150  invalid_slot_val,
151  for_semi_join,
152  join_column,
153  type_info,
154  shard_info,
155  dev_err_buff,
156  bucket_normalization);
157 }
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false 
)

Definition at line 2085 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2098  {
2099  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2100  composite_key_dict,
2101  hash_entry_count,
2102  invalid_slot_val,
2103  key_component_count,
2104  join_column_per_key,
2105  type_info_per_key,
2106  join_bucket_info,
2107  sd_inner_proxy_per_key,
2108  sd_outer_proxy_per_key,
2109  cpu_thread_count,
2110  is_range_join,
2111  is_geo_compressed);
2112 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const void * > &  sd_inner_proxy_per_key,
const std::vector< const void * > &  sd_outer_proxy_per_key,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false 
)

Definition at line 2114 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2127  {
2128  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2129  composite_key_dict,
2130  hash_entry_count,
2131  invalid_slot_val,
2132  key_component_count,
2133  join_column_per_key,
2134  type_info_per_key,
2135  join_bucket_info,
2136  sd_inner_proxy_per_key,
2137  sd_outer_proxy_per_key,
2138  cpu_thread_count,
2139  is_range_join,
2140  is_geo_compressed);
2141 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 537 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

544  {
545  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
546  composite_key_dict,
547  hash_entry_count,
548  invalid_slot_val,
549  key_handler,
550  num_elems);
551 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 553 of file HashJoinRuntimeGpu.cu.

559  {
560  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
561  composite_key_dict,
562  hash_entry_count,
563  invalid_slot_val,
564  key_handler,
565  num_elems);
566 }
void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1448 of file HashJoinRuntime.cpp.

References count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1455  {
1456  auto timer = DEBUG_TIMER(__func__);
1457  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1458  invalid_slot_val,
1459  &join_column,
1460  &type_info,
1461  sd_inner_proxy,
1462  sd_outer_proxy](auto cpu_thread_idx,
1463  auto cpu_thread_count) {
1464  SUFFIX(count_matches)
1465  (count_buff,
1466  invalid_slot_val,
1467  join_column,
1468  type_info,
1469  sd_inner_proxy,
1470  sd_outer_proxy,
1471  cpu_thread_idx,
1472  cpu_thread_count);
1473  };
1474  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1475  buff,
1476  invalid_slot_val,
1477  &join_column,
1478  &type_info,
1479  sd_inner_proxy,
1480  sd_outer_proxy](auto cpu_thread_idx,
1481  auto cpu_thread_count) {
1482  SUFFIX(fill_row_ids)
1483  (buff,
1484  hash_entry_count,
1485  invalid_slot_val,
1486  join_column,
1487  type_info,
1488  sd_inner_proxy,
1489  sd_outer_proxy,
1490  cpu_thread_idx,
1491  cpu_thread_count);
1492  };
1493 
1494  fill_one_to_many_hash_table_impl(buff,
1495  hash_entry_info.hash_entry_count,
1496  invalid_slot_val,
1497  join_column,
1498  type_info,
1499  sd_inner_proxy,
1500  sd_outer_proxy,
1501  cpu_thread_count,
1502  launch_count_matches,
1503  launch_fill_row_ids);
1504 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t hash_entry_count
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define DEBUG_TIMER(name)
Definition: Logger.h:352

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
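
The definition above implies the buffer layout this function expects: the first hash_entry_count slots hold per-key positions, the next hash_entry_count slots serve as match counters (count_buff = buff + hash_entry_info.hash_entry_count), and the row ids follow. A minimal allocation-and-call sketch under that reading; sizing the id section as one entry per input row, and initializing the position slots with init_hash_join_buff() first, are assumptions rather than statements from this page:

    #include <vector>
    #include "HashJoinRuntime.h"

    // Sketch only: size and fill a one-to-many perfect hash table on the CPU.
    std::vector<int32_t> sketch_build_one_to_many(const HashEntryInfo hash_entry_info,
                                                  const int32_t invalid_slot_val,
                                                  const JoinColumn& join_column,
                                                  const JoinColumnTypeInfo& type_info,
                                                  const size_t num_input_rows,
                                                  const unsigned cpu_thread_count) {
      const size_t entry_count = hash_entry_info.hash_entry_count;
      // Assumed layout: [positions | counts | row ids].
      std::vector<int32_t> buff(2 * entry_count + num_input_rows);
      init_hash_join_buff(buff.data(), entry_count, invalid_slot_val,
                          /*cpu_thread_idx=*/0, /*cpu_thread_count=*/1);
      fill_one_to_many_hash_table(buff.data(),
                                  hash_entry_info,
                                  invalid_slot_val,
                                  join_column,
                                  type_info,
                                  /*sd_inner_proxy=*/nullptr,
                                  /*sd_outer_proxy=*/nullptr,
                                  cpu_thread_count);
      return buff;
    }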

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)

Definition at line 1506 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1513  {
1514  auto timer = DEBUG_TIMER(__func__);
1515  auto bucket_normalization = hash_entry_info.bucket_normalization;
1516  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1517  auto launch_count_matches = [bucket_normalization,
1518  count_buff = buff + hash_entry_count,
1519  invalid_slot_val,
1520  &join_column,
1521  &type_info,
1522  sd_inner_proxy,
1523  sd_outer_proxy](auto cpu_thread_idx,
1524  auto cpu_thread_count) {
1525  SUFFIX(count_matches_bucketized)
1526  (count_buff,
1527  invalid_slot_val,
1528  join_column,
1529  type_info,
1530  sd_inner_proxy,
1531  sd_outer_proxy,
1532  cpu_thread_idx,
1533  cpu_thread_count,
1534  bucket_normalization);
1535  };
1536  auto launch_fill_row_ids = [bucket_normalization,
1537  hash_entry_count,
1538  buff,
1539  invalid_slot_val,
1540  &join_column,
1541  &type_info,
1542  sd_inner_proxy,
1543  sd_outer_proxy](auto cpu_thread_idx,
1544  auto cpu_thread_count) {
1545  SUFFIX(fill_row_ids_bucketized)
1546  (buff,
1547  hash_entry_count,
1548  invalid_slot_val,
1549  join_column,
1550  type_info,
1551  sd_inner_proxy,
1552  sd_outer_proxy,
1553  cpu_thread_idx,
1554  cpu_thread_count,
1555  bucket_normalization);
1556  };
1557 
1558  fill_one_to_many_hash_table_impl(buff,
1559  hash_entry_count,
1560  invalid_slot_val,
1561  join_column,
1562  type_info,
1563  sd_inner_proxy,
1564  sd_outer_proxy,
1565  cpu_thread_count,
1566  launch_count_matches,
1567  launch_fill_row_ids);
1568 }
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define SUFFIX(name)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
#define DEBUG_TIMER(name)
Definition: Logger.h:352

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_on_device ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info 
)

Definition at line 239 of file HashJoinRuntimeGpu.cu.

References count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

243  {
244  auto hash_entry_count = hash_entry_info.hash_entry_count;
245  auto count_matches_func = [hash_entry_count,
246  count_buff = buff + hash_entry_count,
247  invalid_slot_val,
248  join_column,
249  type_info] {
250  cuda_kernel_launch_wrapper(
251  SUFFIX(count_matches), count_buff, invalid_slot_val, join_column, type_info);
252  };
253 
254  auto fill_row_ids_func =
255  [buff, hash_entry_count, invalid_slot_val, join_column, type_info] {
256  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
257  buff,
258  hash_entry_count,
259  invalid_slot_val,
260  join_column,
261  type_info);
262  };
263 
264  fill_one_to_many_hash_table_on_device_impl(buff,
265  hash_entry_count,
266  invalid_slot_val,
267  join_column,
268  type_info,
269  count_matches_func,
270  fill_row_ids_func);
271 }
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
size_t hash_entry_count
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info 
)

Definition at line 273 of file HashJoinRuntimeGpu.cu.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

278  {
279  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
280  auto count_matches_func = [count_buff = buff + hash_entry_count,
281  invalid_slot_val,
282  join_column,
283  type_info,
284  bucket_normalization =
285  hash_entry_info.bucket_normalization] {
286  cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
287  count_buff,
288  invalid_slot_val,
289  join_column,
290  type_info,
291  bucket_normalization);
292  };
293 
294  auto fill_row_ids_func = [buff,
295  hash_entry_count =
296  hash_entry_info.getNormalizedHashEntryCount(),
297  invalid_slot_val,
298  join_column,
299  type_info,
300  bucket_normalization = hash_entry_info.bucket_normalization] {
301  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
302  buff,
303  hash_entry_count,
304  invalid_slot_val,
305  join_column,
306  type_info,
307  bucket_normalization);
308  };
309 
310  fill_one_to_many_hash_table_on_device_impl(buff,
311  hash_entry_count,
312  invalid_slot_val,
313  join_column,
314  type_info,
315  count_matches_func,
316  fill_row_ids_func);
317 }
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
size_t getNormalizedHashEntryCount() const

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_sharded ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info 
)

Definition at line 319 of file HashJoinRuntimeGpu.cu.

References count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), HashEntryInfo::hash_entry_count, inclusive_scan(), set_valid_pos(), set_valid_pos_flag(), and SUFFIX.

324  {
325  auto hash_entry_count = hash_entry_info.hash_entry_count;
326  int32_t* pos_buff = buff;
327  int32_t* count_buff = buff + hash_entry_count;
328  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
329  cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
330  count_buff,
331  invalid_slot_val,
332  join_column,
333  type_info,
334  shard_info);
335 
336  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
337 
338  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
339  thrust::inclusive_scan(
340  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
341  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
342  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
343  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
344  buff,
345  hash_entry_count,
346  invalid_slot_val,
347  join_column,
348  type_info,
349  shard_info);
350 }
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
size_t hash_entry_count
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_sharded_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const void *  sd_inner_proxy,
const void *  sd_outer_proxy,
const unsigned  cpu_thread_count 
)
 
ColumnType get_join_column_type_kind ( const SQLTypeInfo &  ti)
inline

Definition at line 128 of file HashJoinRuntime.h.

References SQLTypeInfo::is_date_in_days(), is_unsigned_type(), Signed, SmallDate, and Unsigned.

Referenced by OverlapsJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTable::fetchColumnsForDevice(), BaselineJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

128  {
129  if (ti.is_date_in_days()) {
130  return SmallDate;
131  } else {
132  return is_unsigned_type(ti) ? Unsigned : Signed;
133  }
134 }
bool is_date_in_days() const
Definition: sqltypes.h:858
bool is_unsigned_type(const SQLTypeInfo &ti)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
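
A short sketch of branching on the classifier's result; which kinds take the bucketized fill path is an assumption here (this page only documents the mapping itself), and the SQLTypeInfo is presumed to describe the inner join key:

    #include "HashJoinRuntime.h"

    // Sketch only: decide whether the bucket-normalized fill path is needed.
    bool sketch_needs_bucketized_path(const SQLTypeInfo& inner_key_type) {
      switch (get_join_column_type_kind(inner_key_type)) {
        case SmallDate:
          return true;   // dates stored in days are assumed to be bucket-normalized
        case Signed:
        case Unsigned:
        case Double:
          return false;
      }
      return false;
    }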

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1710 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1716  {
1717  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1718  entry_count,
1719  key_component_count,
1720  with_val_slot,
1721  invalid_slot_val,
1722  cpu_thread_idx,
1723  cpu_thread_count);
1724 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1726 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1732  {
1733  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1734  entry_count,
1735  key_component_count,
1736  with_val_slot,
1737  invalid_slot_val,
1738  cpu_thread_idx,
1739  cpu_thread_count);
1740 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 401 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

405  {
406  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
407  hash_join_buff,
408  entry_count,
409  key_component_count,
410  with_val_slot,
411  invalid_slot_val);
412 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 414 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

418  {
419  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
420  hash_join_buff,
421  entry_count,
422  key_component_count,
423  with_val_slot,
424  invalid_slot_val);
425 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_hash_join_buff ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 92 of file HashJoinRuntime.cpp.

References i.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

96  {
97 #ifdef __CUDACC__
98  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
99  int32_t step = blockDim.x * gridDim.x;
100 #else
101  int32_t start = cpu_thread_idx;
102  int32_t step = cpu_thread_count;
103 #endif
104  for (int64_t i = start; i < hash_entry_count; i += step) {
105  groups_buffer[i] = invalid_slot_val;
106  }
107 }

+ Here is the caller graph for this function:

void init_hash_join_buff_on_device ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 182 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

184  {
185  cuda_kernel_launch_wrapper(
186  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
187 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1764 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1772  {
1773  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1774  entry_count,
1775  invalid_slot_val,
1776  false,
1777  key_component_count,
1778  with_val_slot,
1779  key_handler,
1780  num_elems,
1781  cpu_thread_idx,
1782  cpu_thread_count);
1783 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1828 of file HashJoinRuntime.cpp.

1836  {
1837  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1838  entry_count,
1839  invalid_slot_val,
1840  false,
1841  key_component_count,
1842  with_val_slot,
1843  key_handler,
1844  num_elems,
1845  cpu_thread_idx,
1846  cpu_thread_count);
1847 }
void overlaps_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 494 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

502  {
503  cuda_kernel_launch_wrapper(
504  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
505  hash_buff,
506  entry_count,
507  invalid_slot_val,
508  false,
509  key_component_count,
510  with_val_slot,
511  dev_err_buff,
512  key_handler,
513  num_elems);
514 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 568 of file HashJoinRuntimeGpu.cu.

574  {
575  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
576  composite_key_dict,
577  hash_entry_count,
578  invalid_slot_val,
579  key_handler,
580  num_elems);
581 }
int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1785 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1793  {
1794  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1795  entry_count,
1796  invalid_slot_val,
1797  false,
1798  key_component_count,
1799  with_val_slot,
1800  key_handler,
1801  num_elems,
1802  cpu_thread_idx,
1803  cpu_thread_count);
1804 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1849 of file HashJoinRuntime.cpp.

1857  {
1858  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1859  entry_count,
1860  invalid_slot_val,
1861  false,
1862  key_component_count,
1863  with_val_slot,
1864  key_handler,
1865  num_elems,
1866  cpu_thread_idx,
1867  cpu_thread_count);
1868 }
void range_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 516 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

523  {
524  cuda_kernel_launch_wrapper(
525  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
526  hash_buff,
527  entry_count,
528  invalid_slot_val,
529  false,
530  key_component_count,
531  with_val_slot,
532  dev_err_buff,
533  key_handler,
534  num_elems);
535 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void range_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const size_t  hash_entry_count,
const int32_t  invalid_slot_val,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 583 of file HashJoinRuntimeGpu.cu.

589  {
590  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
591  composite_key_dict,
592  hash_entry_count,
593  invalid_slot_val,
594  key_handler,
595  num_elems);
596 }

Variable Documentation