OmniSciDB
HashJoinRuntimeGpu.cu File Reference
#include "HashJoinRuntime.cpp"
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)
 
#define VALID_POS_FLAG   0
 

Functions

template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper (F func, ARGS &&...args)
 
__global__ void fill_hash_join_buff_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
 
__global__ void fill_hash_join_buff_bucketized_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
 
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
 
__global__ void fill_hash_join_buff_wrapper_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
 
void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
 
__global__ void init_hash_join_buff_wrapper (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
__global__ void set_valid_pos_flag (int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
 
__global__ void set_valid_pos (int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)
 
template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table_on_device (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
 
template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *key_handler, const size_t num_elems)
 
template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_sz_threshold)
 

Macro Definition Documentation

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)

Definition at line 21 of file HashJoinRuntimeGpu.cu.

Referenced by cuda_kernel_launch_wrapper().

#define VALID_POS_FLAG   0

Definition at line 189 of file HashJoinRuntimeGpu.cu.

Referenced by set_valid_pos(), and set_valid_pos_flag().

Function Documentation

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 630 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

633  {
634  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
635  hll_buffer,
636  nullptr,
637  b,
638  num_elems,
639  key_handler);
640 }
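The hll_buffer written by approximate_distinct_tuples_impl_gpu holds one 8-bit HyperLogLog register per bucket, with 2^b buckets in total. As a hedged illustration of what those registers mean (the estimator below is the textbook HLL formula applied to a host copy of the buffer, not necessarily the estimator OmniSciDB itself applies):

#include <cmath>
#include <cstdint>
#include <vector>

// Hedged sketch: textbook HyperLogLog estimate over a host copy of hll_buffer.
// Each register holds the maximum observed leading-zero rank for its bucket.
double estimate_distinct_tuples(const std::vector<uint8_t>& registers, const uint32_t b) {
  const size_t m = size_t{1} << b;  // 2^b buckets
  double inv_sum = 0.0;
  for (size_t i = 0; i < m; ++i) {
    inv_sum += std::pow(2.0, -static_cast<double>(registers[i]));
  }
  const double alpha = 0.7213 / (1.0 + 1.079 / static_cast<double>(m));
  return alpha * m * m / inv_sum;  // raw estimate, no small/large-range correction
}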

void approximate_distinct_tuples_on_device_overlaps ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const OverlapsKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 598 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

602  {
603  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
604  hll_buffer,
605  row_counts_buffer,
606  b,
607  num_elems,
608  key_handler);
609 
610  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
611  thrust::inclusive_scan(
612  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
613 }
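The trailing thrust::inclusive_scan turns the per-row match counts into running totals in place, so each row can later find where its matches end in a packed output. A toy illustration with invented values:

// row_counts_buffer before the scan: {1, 0, 2, 1}
// row_counts_buffer after the scan:  {1, 1, 3, 4}
// entry i now holds the total match count over rows 0..i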

void approximate_distinct_tuples_on_device_range ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const RangeKeyHandler *  key_handler,
const size_t  num_elems,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 615 of file HashJoinRuntimeGpu.cu.

References inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

621  {
622  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x>>>(
623  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
624 
625  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
626  thrust::inclusive_scan(
627  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
628 }

void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn *  join_column,
const JoinColumnTypeInfo *  type_info,
const double *  bucket_sz_threshold 
)

Definition at line 642 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

645  {
646  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
647  bucket_sizes_buffer,
648  join_column,
649  type_info,
650  bucket_sz_threshold);
651 }
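A hedged host-side usage sketch (the dev_* allocations are illustrative; the JoinColumn, JoinColumnTypeInfo, and threshold pointers are assumed to be device-resident copies prepared by the caller, since the kernel dereferences them on the device). The <2> instantiation computes one bucket size per dimension of the two-dimensional overlaps-join keys, so the output buffer holds two doubles:

double* dev_bucket_sizes = nullptr;
checkCudaErrors(cudaMalloc(&dev_bucket_sizes, 2 * sizeof(double)));
checkCudaErrors(cudaMemset(dev_bucket_sizes, 0, 2 * sizeof(double)));
compute_bucket_sizes_on_device(
    dev_bucket_sizes, dev_join_column, dev_type_info, dev_bucket_thresholds);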

template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper ( F  func,
ARGS &&...  args 
)

Definition at line 24 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors.

Referenced by approximate_distinct_tuples_on_device(), approximate_distinct_tuples_on_device_overlaps(), compute_bucket_sizes_on_device(), fill_baseline_hash_join_buff_on_device_32(), fill_baseline_hash_join_buff_on_device_64(), fill_hash_join_buff_on_device(), fill_hash_join_buff_on_device_bucketized(), fill_hash_join_buff_on_device_sharded(), fill_hash_join_buff_on_device_sharded_bucketized(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device(), fill_one_to_many_hash_table_on_device_bucketized(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), init_baseline_hash_join_buff_on_device_32(), init_baseline_hash_join_buff_on_device_64(), init_hash_join_buff_on_device(), overlaps_fill_baseline_hash_join_buff_on_device_64(), and range_fill_baseline_hash_join_buff_on_device_64().

24  {
25  int grid_size = -1;
26  int block_size = -1;
27  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
28  func<<<grid_size, block_size>>>(std::forward<ARGS>(args)...);
29  checkCudaErrors(cudaGetLastError());
30 }
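As a usage sketch (the kernel below is hypothetical, not part of this file), any __global__ function can be launched through the wrapper: cudaOccupancyMaxPotentialBlockSize picks a grid/block shape for that specific kernel, and the remaining arguments are perfectly forwarded to the triple-chevron launch.

// Hypothetical kernel in the same grid-stride style as set_valid_pos() below.
__global__ void scale_buffer(int32_t* data, const int64_t n, const int32_t factor) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < n; i += step) {
    data[i] *= factor;
  }
}

void scale_buffer_on_device(int32_t* dev_data, const int64_t n) {
  cuda_kernel_launch_wrapper(scale_buffer, dev_data, n, 2);  // double every element
}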

void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 450 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

458  {
459  cuda_kernel_launch_wrapper(
460  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
461  hash_buff,
462  entry_count,
463  invalid_slot_val,
464  for_semi_join,
465  key_component_count,
466  with_val_slot,
467  dev_err_buff,
468  key_handler,
469  num_elems);
470 }

void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 472 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

480  {
481  cuda_kernel_launch_wrapper(
482  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
483  hash_buff,
484  entry_count,
485  invalid_slot_val,
486  for_semi_join,
487  key_component_count,
488  with_val_slot,
489  dev_err_buff,
490  key_handler,
491  num_elems);
492 }

template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  err,
const KEY_HANDLER *  key_handler,
const int64_t  num_elems 
)

Definition at line 428 of file HashJoinRuntimeGpu.cu.

References fill_baseline_hash_join_buff() and SUFFIX.

436  {
437  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
438  entry_count,
439  invalid_slot_val,
440  for_semi_join,
441  key_component_count,
442  with_val_slot,
443  key_handler,
444  num_elems,
445  -1,
446  -1);
447  atomicCAS(err, 0, partial_err);
448 }
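The closing atomicCAS(err, 0, partial_err) records only the first nonzero error: the compare-and-swap stores partial_err while *err is still 0 and leaves any previously recorded code untouched. A hedged host-side sketch of consuming that buffer after the launch (the surrounding names are illustrative, not from this file):

int host_err = 0;
checkCudaErrors(
    cudaMemcpy(&host_err, dev_err_buff, sizeof(int), cudaMemcpyDeviceToHost));
if (host_err != 0) {
  // e.g. abandon this GPU hash table build and retry with a larger table,
  // or fall back to a different join strategy.
}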

__global__ void fill_hash_join_buff_bucketized_wrapper ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
int *  err,
const int64_t  bucket_normalization 
)

Definition at line 43 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_bucketized().

50  {
51  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
52  invalid_slot_val,
53  for_semi_join,
54  join_column,
55  type_info,
56  NULL,
57  NULL,
58  -1,
59  -1,
60  bucket_normalization);
61  atomicCAS(err, 0, partial_err);
62 }

void fill_hash_join_buff_on_device ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info 
)

Definition at line 81 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

86  {
87  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
88  buff,
89  invalid_slot_val,
90  for_semi_join,
91  join_column,
92  type_info,
93  dev_err_buff);
94 }

void fill_hash_join_buff_on_device_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int64_t  bucket_normalization 
)

Definition at line 64 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

70  {
71  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
72  buff,
73  invalid_slot_val,
74  for_semi_join,
75  join_column,
76  type_info,
77  dev_err_buff,
78  bucket_normalization);
79 }

void fill_hash_join_buff_on_device_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info 
)

Definition at line 159 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

165  {
166  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
167  buff,
168  invalid_slot_val,
169  for_semi_join,
170  join_column,
171  type_info,
172  shard_info,
173  dev_err_buff);
174 }

void fill_hash_join_buff_on_device_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int64_t  bucket_normalization 
)

Definition at line 139 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

147  {
148  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
149  buff,
150  invalid_slot_val,
151  for_semi_join,
152  join_column,
153  type_info,
154  shard_info,
155  dev_err_buff,
156  bucket_normalization);
157 }

__global__ void fill_hash_join_buff_wrapper ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
int *  err 
)

Definition at line 32 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device().

37  {
38  int partial_err = SUFFIX(fill_hash_join_buff)(
39  buff, invalid_slot_val, for_semi_join, join_column, type_info, NULL, NULL, -1, -1);
40  atomicCAS(err, 0, partial_err);
41 }

__global__ void fill_hash_join_buff_wrapper_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
int *  err 
)

Definition at line 119 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded().

125  {
126  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(buff,
127  invalid_slot_val,
128  for_semi_join,
129  join_column,
130  type_info,
131  shard_info,
132  NULL,
133  NULL,
134  -1,
135  -1);
136  atomicCAS(err, 0, partial_err);
137 }

__global__ void fill_hash_join_buff_wrapper_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
int *  err,
const int64_t  bucket_normalization 
)

Definition at line 96 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded_bucketized().

104  {
105  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
106  invalid_slot_val,
107  for_semi_join,
108  join_column,
109  type_info,
110  shard_info,
111  NULL,
112  NULL,
113  -1,
114  -1,
115  bucket_normalization);
116  atomicCAS(err, 0, partial_err);
117 }

template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const KEY_HANDLER *  key_handler,
const size_t  num_elems 
)

Definition at line 353 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

358  {
359  auto pos_buff = buff;
360  auto count_buff = buff + hash_entry_count;
361  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
362  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
363  count_buff,
364  composite_key_dict,
365  hash_entry_count,
366  key_handler,
367  num_elems);
368 
369  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
370 
371  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
372  thrust::inclusive_scan(
373  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
374  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
375  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
376 
377  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
378  buff,
379  composite_key_dict,
380  hash_entry_count,
381  invalid_slot_val,
382  key_handler,
383  num_elems);
384 }

void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 537 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

544  {
545  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
546  composite_key_dict,
547  hash_entry_count,
548  invalid_slot_val,
549  key_handler,
550  num_elems);
551 }


void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const GenericKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 553 of file HashJoinRuntimeGpu.cu.

559  {
560  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
561  composite_key_dict,
562  hash_entry_count,
563  invalid_slot_val,
564  key_handler,
565  num_elems);
566 }

void fill_one_to_many_hash_table_on_device ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info 
)

Definition at line 239 of file HashJoinRuntimeGpu.cu.

References count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

243  {
244  auto hash_entry_count = hash_entry_info.hash_entry_count;
245  auto count_matches_func = [hash_entry_count,
246  count_buff = buff + hash_entry_count,
247  invalid_slot_val,
248  join_column,
249  type_info] {
250  cuda_kernel_launch_wrapper(
251  SUFFIX(count_matches), count_buff, invalid_slot_val, join_column, type_info);
252  };
253 
254  auto fill_row_ids_func =
255  [buff, hash_entry_count, invalid_slot_val, join_column, type_info] {
256  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
257  buff,
258  hash_entry_count,
259  invalid_slot_val,
260  join_column,
261  type_info);
262  };
263 
264  fill_one_to_many_hash_table_on_device_impl(buff,
265  hash_entry_count,
266  invalid_slot_val,
267  join_column,
268  type_info,
269  count_matches_func,
270  fill_row_ids_func);
271 }

void fill_one_to_many_hash_table_on_device_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info 
)

Definition at line 273 of file HashJoinRuntimeGpu.cu.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

278  {
279  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
280  auto count_matches_func = [count_buff = buff + hash_entry_count,
281  invalid_slot_val,
282  join_column,
283  type_info,
284  bucket_normalization =
285  hash_entry_info.bucket_normalization] {
286  cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
287  count_buff,
288  invalid_slot_val,
289  join_column,
290  type_info,
291  bucket_normalization);
292  };
293 
294  auto fill_row_ids_func = [buff,
295  hash_entry_count =
296  hash_entry_info.getNormalizedHashEntryCount(),
297  invalid_slot_val,
298  join_column,
299  type_info,
300  bucket_normalization = hash_entry_info.bucket_normalization] {
301  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
302  buff,
303  hash_entry_count,
304  invalid_slot_val,
305  join_column,
306  type_info,
307  bucket_normalization);
308  };
309 
310  fill_one_to_many_hash_table_on_device_impl(buff,
311  hash_entry_count,
312  invalid_slot_val,
313  join_column,
314  type_info,
315  count_matches_func,
316  fill_row_ids_func);
317 }

template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info,
COUNT_MATCHES_FUNCTOR  count_matches_func,
FILL_ROW_IDS_FUNCTOR  fill_row_ids_func 
)

Definition at line 216 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

Referenced by fill_one_to_many_hash_table_on_device(), and fill_one_to_many_hash_table_on_device_bucketized().

222  {
223  int32_t* pos_buff = buff;
224  int32_t* count_buff = buff + hash_entry_count;
225  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
226  count_matches_func();
227 
228  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
229 
230  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
231  thrust::inclusive_scan(
232  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
233 
234  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
235  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
236  fill_row_ids_func();
237 }
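To make the three phases concrete, here is a hedged walkthrough with invented values for four hash slots; buff is laid out as [pos_buff | count_buff | row-id payload], with the first two regions each hash_entry_count entries wide:

// after count_matches_func():   count_buff = {2, 0, 1, 2}
// after set_valid_pos_flag:     pos_buff[i] = VALID_POS_FLAG wherever count_buff[i] != 0
// after thrust::inclusive_scan: count_buff = {2, 2, 3, 5}   (running totals)
// after set_valid_pos:          pos_buff   = {0, -, 2, 3}   (start offset of each
//                               slot's row-id run; empty slot 1 is left untouched)
// count_buff is then zeroed and fill_row_ids_func() writes each matching row id
// into the payload at pos_buff[slot] plus a per-slot cursor (the reused count_buff).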

void fill_one_to_many_hash_table_on_device_sharded ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn &  join_column,
const JoinColumnTypeInfo &  type_info,
const ShardInfo &  shard_info 
)

Definition at line 319 of file HashJoinRuntimeGpu.cu.

References count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), HashEntryInfo::hash_entry_count, inclusive_scan(), set_valid_pos(), set_valid_pos_flag(), and SUFFIX.

324  {
325  auto hash_entry_count = hash_entry_info.hash_entry_count;
326  int32_t* pos_buff = buff;
327  int32_t* count_buff = buff + hash_entry_count;
328  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
329  cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
330  count_buff,
331  invalid_slot_val,
332  join_column,
333  type_info,
334  shard_info);
335 
336  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
337 
338  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
339  thrust::inclusive_scan(
340  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
341  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
342  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
343  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
344  buff,
345  hash_entry_count,
346  invalid_slot_val,
347  join_column,
348  type_info,
349  shard_info);
350 }

void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 401 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

405  {
406  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
407  hash_join_buff,
408  entry_count,
409  key_component_count,
410  with_val_slot,
411  invalid_slot_val);
412 }

void init_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 414 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

418  {
419  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
420  hash_join_buff,
421  entry_count,
422  key_component_count,
423  with_val_slot,
424  invalid_slot_val);
425 }

template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 387 of file HashJoinRuntimeGpu.cu.

References init_baseline_hash_join_buff() and SUFFIX.

391  {
392  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
393  entry_count,
394  key_component_count,
395  with_val_slot,
396  invalid_slot_val,
397  -1,
398  -1);
399 }

void init_hash_join_buff_on_device ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 182 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

184  {
185  cuda_kernel_launch_wrapper(
186  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
187 }

__global__ void init_hash_join_buff_wrapper ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 176 of file HashJoinRuntimeGpu.cu.

References init_hash_join_buff(), and SUFFIX.

Referenced by init_hash_join_buff_on_device().

178  {
179  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
180 }

void overlaps_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const OverlapsKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 494 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

502  {
503  cuda_kernel_launch_wrapper(
504  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
505  hash_buff,
506  entry_count,
507  invalid_slot_val,
508  false,
509  key_component_count,
510  with_val_slot,
511  dev_err_buff,
512  key_handler,
513  num_elems);
514 }

void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val,
const OverlapsKeyHandler *  key_handler,
const int64_t  num_elems 
)

Definition at line 568 of file HashJoinRuntimeGpu.cu.

574  {
575  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
576  composite_key_dict,
577  hash_entry_count,
578  invalid_slot_val,
579  key_handler,
580  num_elems);
581 }

void range_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const RangeKeyHandler *  key_handler,
const size_t  num_elems 
)

Definition at line 516 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

523  {
524  cuda_kernel_launch_wrapper(
525  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
526  hash_buff,
527  entry_count,
528  invalid_slot_val,
529  false,
530  key_component_count,
531  with_val_slot,
532  dev_err_buff,
533  key_handler,
534  num_elems);
535 }

void range_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const size_t  hash_entry_count,
const int32_t  invalid_slot_val,
const RangeKeyHandler *  key_handler,
const size_t  num_elems 
)

Definition at line 583 of file HashJoinRuntimeGpu.cu.

589  {
590  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
591  composite_key_dict,
592  hash_entry_count,
593  invalid_slot_val,
594  key_handler,
595  num_elems);
596 }

__global__ void set_valid_pos ( int32_t *  pos_buff,
int32_t *  count_buff,
const int64_t  entry_count 
)

Definition at line 203 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

205  {
206  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
207  const int32_t step = blockDim.x * gridDim.x;
208  for (int64_t i = start; i < entry_count; i += step) {
209  if (VALID_POS_FLAG == pos_buff[i]) {
210  pos_buff[i] = !i ? 0 : count_buff[i - 1];
211  }
212  }
213 }

__global__ void set_valid_pos_flag ( int32_t *  pos_buff,
const int32_t *  count_buff,
const int64_t  entry_count 
)

Definition at line 191 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

193  {
194  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
195  const int32_t step = blockDim.x * gridDim.x;
196  for (int64_t i = start; i < entry_count; i += step) {
197  if (count_buff[i]) {
198  pos_buff[i] = VALID_POS_FLAG;
199  }
200  }
201 }