OmniSciDB  8fa3bf436f
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HashJoinRuntimeGpu.cu File Reference
#include "HashJoinRuntime.cpp"
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
+ Include dependency graph for HashJoinRuntimeGpu.cu:

Go to the source code of this file.

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)
 
#define VALID_POS_FLAG   0
 

Functions

template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper (F func, ARGS &&...args)
 
__global__ void fill_hash_join_buff_wrapper (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
 
__global__ void fill_hash_join_buff_bucketized_wrapper (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
 
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
 
__global__ void fill_hash_join_buff_wrapper_sharded (int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
 
void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
 
__global__ void init_hash_join_buff_wrapper (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
__global__ void set_valid_pos_flag (int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
 
__global__ void set_valid_pos (int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)
 
template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table_on_device (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
 
template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *key_handler, const size_t num_elems)
 
template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_sz_threshold)
 

Macro Definition Documentation

#define checkCudaErrors (   err)    CHECK_EQ(err, cudaSuccess)

Definition at line 21 of file HashJoinRuntimeGpu.cu.

Referenced by cuda_kernel_launch_wrapper().

#define VALID_POS_FLAG   0

Definition at line 167 of file HashJoinRuntimeGpu.cu.

Referenced by set_valid_pos(), and set_valid_pos_flag().

Function Documentation

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 550 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

553  {
554  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
555  hll_buffer,
556  nullptr,
557  b,
558  num_elems,
559  key_handler);
560 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_overlaps ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const OverlapsKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 533 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

537  {
538  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
539  hll_buffer,
540  row_counts_buffer,
541  b,
542  num_elems,
543  key_handler);
544 
545  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
546  thrust::inclusive_scan(
547  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
548 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn *  join_column,
const JoinColumnTypeInfo *  type_info,
const double *  bucket_sz_threshold 
)

Definition at line 562 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

565  {
566  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
567  bucket_sizes_buffer,
568  join_column,
569  type_info,
570  bucket_sz_threshold);
571 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper ( F  func,
ARGS &&...  args 
)
void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 426 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

433  {
434  cuda_kernel_launch_wrapper(
435  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
436  hash_buff,
437  entry_count,
438  invalid_slot_val,
439  key_component_count,
440  with_val_slot,
441  dev_err_buff,
442  key_handler,
443  num_elems);
444 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 446 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

453  {
454  cuda_kernel_launch_wrapper(
455  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
456  hash_buff,
457  entry_count,
458  invalid_slot_val,
459  key_component_count,
460  with_val_slot,
461  dev_err_buff,
462  key_handler,
463  num_elems);
464 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  err,
const KEY_HANDLER *  key_handler,
const int64_t  num_elems 
)

Definition at line 406 of file HashJoinRuntimeGpu.cu.

References fill_baseline_hash_join_buff(), SUFFIX, and omnisci.dtypes::T.

413  {
414  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
415  entry_count,
416  invalid_slot_val,
417  key_component_count,
418  with_val_slot,
419  key_handler,
420  num_elems,
421  -1,
422  -1);
423  atomicCAS(err, 0, partial_err);
424 }
#define SUFFIX(name)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

__global__ void fill_hash_join_buff_bucketized_wrapper ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
int *  err,
const int64_t  bucket_normalization 
)

Definition at line 42 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_bucketized().

48  {
49  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
50  invalid_slot_val,
51  join_column,
52  type_info,
53  NULL,
54  NULL,
55  -1,
56  -1,
57  bucket_normalization);
58  atomicCAS(err, 0, partial_err);
59 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_hash_join_buff_on_device ( int32_t *  buff,
const int32_t  invalid_slot_val,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info 
)

Definition at line 76 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

80  {
81  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
82  buff,
83  invalid_slot_val,
84  join_column,
85  type_info,
86  dev_err_buff);
87 }
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int64_t  bucket_normalization 
)

Definition at line 61 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

66  {
67  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
68  buff,
69  invalid_slot_val,
70  join_column,
71  type_info,
72  dev_err_buff,
73  bucket_normalization);
74 }
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info 
)

Definition at line 139 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

144  {
145  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
146  buff,
147  invalid_slot_val,
148  join_column,
149  type_info,
150  shard_info,
151  dev_err_buff);
152 }
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int64_t  bucket_normalization 
)

Definition at line 121 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

128  {
129  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
130  buff,
131  invalid_slot_val,
132  join_column,
133  type_info,
134  shard_info,
135  dev_err_buff,
136  bucket_normalization);
137 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)

+ Here is the call graph for this function:

__global__ void fill_hash_join_buff_wrapper ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
int *  err 
)

Definition at line 32 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device().

36  {
37  int partial_err = SUFFIX(fill_hash_join_buff)(
38  buff, invalid_slot_val, join_column, type_info, NULL, NULL, -1, -1);
39  atomicCAS(err, 0, partial_err);
40 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
int *  err 
)

Definition at line 110 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded().

115  {
116  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(
117  buff, invalid_slot_val, join_column, type_info, shard_info, NULL, NULL, -1, -1);
118  atomicCAS(err, 0, partial_err);
119 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
int *  err,
const int64_t  bucket_normalization 
)

Definition at line 89 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded_bucketized().

96  {
97  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
98  invalid_slot_val,
99  join_column,
100  type_info,
101  shard_info,
102  NULL,
103  NULL,
104  -1,
105  -1,
106  bucket_normalization);
107  atomicCAS(err, 0, partial_err);
108 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const KEY_HANDLER *  key_handler,
const size_t  num_elems 
)

Definition at line 331 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

336  {
337  auto pos_buff = buff;
338  auto count_buff = buff + hash_entry_count;
339  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
340  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
341  count_buff,
342  composite_key_dict,
343  hash_entry_count,
344  key_handler,
345  num_elems);
346 
347  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
348 
349  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
350  thrust::inclusive_scan(
351  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
352  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
353  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
354 
355  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
356  buff,
357  composite_key_dict,
358  hash_entry_count,
359  invalid_slot_val,
360  key_handler,
361  num_elems);
362 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 487 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

494  {
495  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
496  composite_key_dict,
497  hash_entry_count,
498  invalid_slot_val,
499  key_handler,
500  num_elems);
501 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 503 of file HashJoinRuntimeGpu.cu.

509  {
510  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
511  composite_key_dict,
512  hash_entry_count,
513  invalid_slot_val,
514  key_handler,
515  num_elems);
516 }
void fill_one_to_many_hash_table_on_device ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info 
)

Definition at line 217 of file HashJoinRuntimeGpu.cu.

References count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

221  {
222  auto hash_entry_count = hash_entry_info.hash_entry_count;
223  auto count_matches_func = [hash_entry_count,
224  count_buff = buff + hash_entry_count,
225  invalid_slot_val,
226  join_column,
227  type_info] {
228  cuda_kernel_launch_wrapper(
229  SUFFIX(count_matches), count_buff, invalid_slot_val, join_column, type_info);
230  };
231 
232  auto fill_row_ids_func =
233  [buff, hash_entry_count, invalid_slot_val, join_column, type_info] {
234  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
235  buff,
236  hash_entry_count,
237  invalid_slot_val,
238  join_column,
239  type_info);
240  };
241 
242  fill_one_to_many_hash_table_on_device_impl(buff,
243  hash_entry_count,
244  invalid_slot_val,
245  join_column,
246  type_info,
247  count_matches_func,
248  fill_row_ids_func);
249 }
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
size_t hash_entry_count
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info 
)

Definition at line 251 of file HashJoinRuntimeGpu.cu.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

256  {
257  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
258  auto count_matches_func = [count_buff = buff + hash_entry_count,
259  invalid_slot_val,
260  join_column,
261  type_info,
262  bucket_normalization =
263  hash_entry_info.bucket_normalization] {
264  cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
265  count_buff,
266  invalid_slot_val,
267  join_column,
268  type_info,
269  bucket_normalization);
270  };
271 
272  auto fill_row_ids_func = [buff,
273  hash_entry_count =
274  hash_entry_info.getNormalizedHashEntryCount(),
275  invalid_slot_val,
276  join_column,
277  type_info,
278  bucket_normalization = hash_entry_info.bucket_normalization] {
279  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
280  buff,
281  hash_entry_count,
282  invalid_slot_val,
283  join_column,
284  type_info,
285  bucket_normalization);
286  };
287 
288  fill_one_to_many_hash_table_on_device_impl(buff,
289  hash_entry_count,
290  invalid_slot_val,
291  join_column,
292  type_info,
293  count_matches_func,
294  fill_row_ids_func);
295 }
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
size_t getNormalizedHashEntryCount() const

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info,
COUNT_MATCHES_FUNCTOR  count_matches_func,
FILL_ROW_IDS_FUNCTOR  fill_row_ids_func 
)

Definition at line 194 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

Referenced by fill_one_to_many_hash_table_on_device(), and fill_one_to_many_hash_table_on_device_bucketized().

200  {
201  int32_t* pos_buff = buff;
202  int32_t* count_buff = buff + hash_entry_count;
203  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
204  count_matches_func();
205 
206  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
207 
208  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
209  thrust::inclusive_scan(
210  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
211 
212  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
213  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
214  fill_row_ids_func();
215 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_on_device_sharded ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info,
const ShardInfo &  shard_info 
)

Definition at line 297 of file HashJoinRuntimeGpu.cu.

References count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), HashEntryInfo::hash_entry_count, inclusive_scan(), set_valid_pos(), set_valid_pos_flag(), and SUFFIX.

302  {
303  auto hash_entry_count = hash_entry_info.hash_entry_count;
304  int32_t* pos_buff = buff;
305  int32_t* count_buff = buff + hash_entry_count;
306  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
307  cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
308  count_buff,
309  invalid_slot_val,
310  join_column,
311  type_info,
312  shard_info);
313 
314  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
315 
316  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
317  thrust::inclusive_scan(
318  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
319  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
320  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
321  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
322  buff,
323  hash_entry_count,
324  invalid_slot_val,
325  join_column,
326  type_info,
327  shard_info);
328 }
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
size_t hash_entry_count
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 379 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

383  {
384  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
385  hash_join_buff,
386  entry_count,
387  key_component_count,
388  with_val_slot,
389  invalid_slot_val);
390 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Host-side launcher: initializes a baseline hash join buffer with 64-bit keys on
// the GPU. Mirrors init_baseline_hash_join_buff_on_device_32, differing only in the
// key width template argument handed to the wrapper kernel.
void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Device-side wrapper kernel around SUFFIX(init_baseline_hash_join_buff)<T>.
// The trailing (-1, -1) pair fills the cpu_thread_idx / cpu_thread_count parameters
// of the shared CPU/GPU implementation (see its declaration); on the GPU path those
// are sentinel values — presumably the device build derives its stride from
// threadIdx/blockIdx instead. TODO(review): confirm in HashJoinRuntime.cpp.
template <typename T>
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                     const int64_t entry_count,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     const int32_t invalid_slot_val) {
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}
#define SUFFIX(name)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

// Host-side launcher: initializes a one-to-one hash join buffer on the GPU by
// launching init_hash_join_buff_wrapper (which fills each slot with
// invalid_slot_val — see the wrapper kernel).
//
// NOTE(review): the documentation scrape of this snippet dropped the line that
// opens the cuda_kernel_launch_wrapper(...) call (original line 163), leaving its
// arguments dangling; reconstructed here from the Doxygen cross-reference
// "References cuda_kernel_launch_wrapper()".
void init_hash_join_buff_on_device(int32_t* buff,
                                   const int64_t hash_entry_count,
                                   const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(
      init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
}
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Device-side wrapper kernel around SUFFIX(init_hash_join_buff). The trailing
// (-1, -1) pair fills the cpu_thread_idx / cpu_thread_count parameters of the
// shared CPU/GPU implementation — sentinel values on the GPU path.
__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int64_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}
#define SUFFIX(name)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

// Host-side launcher: populates a 64-bit baseline hash table for overlaps joins by
// launching fill_baseline_hash_join_buff_wrapper specialized for
// unsigned long long keys and the OverlapsKeyHandler. Errors raised inside the
// kernel are reported through dev_err_buff (device-resident; caller copies it back).
//
// NOTE(review): the documentation scrape dropped the line opening the
// cuda_kernel_launch_wrapper(...) call (original line 475); reconstructed from the
// Doxygen cross-reference. The key_handler parameter is rendered without a '*' in
// the scraped signature — declared here as a pointer to match the wrapper's usage;
// TODO confirm against the original source.
void overlaps_fill_baseline_hash_join_buff_on_device_64(
    int8_t* hash_buff,
    const int64_t entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const bool with_val_slot,
    int* dev_err_buff,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

// Host-side entry point: builds the one-to-many portion (pos/count/payload
// buffers) of a 64-bit baseline hash table for overlaps joins. Delegates to the
// generic fill_one_to_many_baseline_hash_table_on_device<int64_t> with the
// overlaps key handler.
// NOTE(review): key_handler is rendered without a '*' in the scraped signature —
// declared here as a pointer to match the generic fill routine's usage; confirm
// against the original source.
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems);
}
// Kernel: second pass of the pos-buffer construction. For every entry previously
// flagged valid by set_valid_pos_flag (slot == VALID_POS_FLAG), overwrite the flag
// with the entry's start offset: entry 0 starts at 0, entry i starts at
// count_buff[i - 1] — count_buff is presumably an inclusive scan of the per-entry
// match counts at this point (see the inclusive_scan call in the fill_* routines).
// Grid-stride loop: correct for any launch configuration.
__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int64_t entry_count) {
  const int32_t first = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t stride = blockDim.x * gridDim.x;
  for (int64_t idx = first; idx < entry_count; idx += stride) {
    if (pos_buff[idx] == VALID_POS_FLAG) {
      pos_buff[idx] = (idx == 0) ? 0 : count_buff[idx - 1];
    }
  }
}
#define VALID_POS_FLAG

+ Here is the caller graph for this function:

// Kernel: first pass of the pos-buffer construction. Marks every hash entry that
// has at least one match (count_buff[i] != 0) by writing VALID_POS_FLAG into its
// pos_buff slot; set_valid_pos later replaces the flag with the real start offset.
// Grid-stride loop: correct for any launch configuration.
__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int64_t entry_count) {
  const int32_t first = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t stride = blockDim.x * gridDim.x;
  for (int64_t idx = first; idx < entry_count; idx += stride) {
    if (count_buff[idx] != 0) {
      pos_buff[idx] = VALID_POS_FLAG;
    }
  }
}
#define VALID_POS_FLAG

+ Here is the caller graph for this function: