OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntimeGpu.cu File Reference
#include "HashJoinRuntime.cpp"
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
+ Include dependency graph for HashJoinRuntimeGpu.cu:

Go to the source code of this file.

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)
 
#define VALID_POS_FLAG   0
 

Functions

CUstream getQueryEngineCudaStream ()
 
template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper (F func, ARGS &&...args)
 
__global__ void fill_hash_join_buff_wrapper (OneToOnePerfectJoinHashTableFillFuncArgs const args)
 
__global__ void fill_hash_join_buff_bucketized_wrapper (OneToOnePerfectJoinHashTableFillFuncArgs const args)
 
void fill_hash_join_buff_on_device_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args)
 
void fill_hash_join_buff_on_device (OneToOnePerfectJoinHashTableFillFuncArgs const args)
 
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
__global__ void fill_hash_join_buff_wrapper_sharded (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
void fill_hash_join_buff_on_device_sharded_bucketized (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
void fill_hash_join_buff_on_device_sharded (OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
__global__ void init_hash_join_buff_wrapper (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
__global__ void set_valid_pos_flag (int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
 
__global__ void set_valid_pos (int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)
 
template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table_on_device (OneToManyPerfectJoinHashTableFillFuncArgs const args)
 
void fill_one_to_many_hash_table_on_device_bucketized (OneToManyPerfectJoinHashTableFillFuncArgs const args)
 
void fill_one_to_many_hash_table_on_device_sharded (OneToManyPerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
 
template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *key_handler, const size_t num_elems, const bool for_window_framing)
 
template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void bbox_intersect_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples_on_device_bbox_intersect (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_sz_threshold)
 

Macro Definition Documentation

#define checkCudaErrors (   err)    CHECK_EQ(err, cudaSuccess)

Definition at line 25 of file HashJoinRuntimeGpu.cu.

#define VALID_POS_FLAG   0

Definition at line 121 of file HashJoinRuntimeGpu.cu.

Referenced by set_valid_pos(), and set_valid_pos_flag().

Function Documentation

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 537 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

540  {
541  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
542  hll_buffer,
543  nullptr,
544  b,
545  num_elems,
546  key_handler);
547 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_bbox_intersect ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 501 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by BoundingBoxIntersectJoinHashTable::approximateTupleCount().

506  {
507  cuda_kernel_launch_wrapper(
508  approximate_distinct_tuples_impl_gpu<BoundingBoxIntersectKeyHandler>,
509  hll_buffer,
510  row_counts_buffer,
511  b,
512  num_elems,
513  key_handler);
514 
515  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
516  thrust::inclusive_scan(
517  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
518 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_range ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 520 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

526  {
527  auto qe_cuda_stream = getQueryEngineCudaStream();
528  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
529  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
530  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
531 
532  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
533  thrust::inclusive_scan(
534  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
535 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void bbox_intersect_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 406 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_baseline_hash_join_buff_wrapper().

414  {
415  cuda_kernel_launch_wrapper(
416  fill_baseline_hash_join_buff_wrapper<unsigned long long,
417  BoundingBoxIntersectKeyHandler>,
418  hash_buff,
419  entry_count,
420  invalid_slot_val,
421  false,
422  key_component_count,
423  with_val_slot,
424  dev_err_buff,
425  key_handler,
426  num_elems);
427 }
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void bbox_intersect_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const BoundingBoxIntersectKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 481 of file HashJoinRuntimeGpu.cu.

486  {
487  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
488  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
489 }
void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_sz_threshold 
)

Definition at line 549 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{BoundingBoxIntersectJoinHashTable.cpp}::compute_bucket_sizes().

552  {
553  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
554  bucket_sizes_buffer,
555  join_column,
556  type_info,
557  bucket_sz_threshold);
558 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper ( func,
ARGS &&...  args 
)

Definition at line 28 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, checkCudaErrors, and getQueryEngineCudaStream().

Referenced by approximate_distinct_tuples_on_device(), approximate_distinct_tuples_on_device_bbox_intersect(), bbox_intersect_fill_baseline_hash_join_buff_on_device_64(), compute_bucket_sizes_on_device(), fill_baseline_hash_join_buff_on_device_32(), fill_baseline_hash_join_buff_on_device_64(), fill_hash_join_buff_on_device(), fill_hash_join_buff_on_device_bucketized(), fill_hash_join_buff_on_device_sharded(), fill_hash_join_buff_on_device_sharded_bucketized(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device(), fill_one_to_many_hash_table_on_device_bucketized(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), init_baseline_hash_join_buff_on_device_32(), init_baseline_hash_join_buff_on_device_64(), init_hash_join_buff_on_device(), and range_fill_baseline_hash_join_buff_on_device_64().

28  {
29  int grid_size = -1;
30  int block_size = -1;
31  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
32  auto qe_cuda_stream = getQueryEngineCudaStream();
33  func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(args)...);
34  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
35 }
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 362 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

370  {
371  cuda_kernel_launch_wrapper(
372  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
373  hash_buff,
374  entry_count,
375  invalid_slot_val,
376  for_semi_join,
377  key_component_count,
378  with_val_slot,
379  dev_err_buff,
380  key_handler,
381  num_elems);
382 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 384 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

392  {
393  cuda_kernel_launch_wrapper(
394  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
395  hash_buff,
396  entry_count,
397  invalid_slot_val,
398  for_semi_join,
399  key_component_count,
400  with_val_slot,
401  dev_err_buff,
402  key_handler,
403  num_elems);
404 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  err,
const KEY_HANDLER *  key_handler,
const int64_t  num_elems 
)

Definition at line 340 of file HashJoinRuntimeGpu.cu.

References fill_baseline_hash_join_buff(), SUFFIX, and heavydb.dtypes::T.

Referenced by bbox_intersect_fill_baseline_hash_join_buff_on_device_64().

348  {
349  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
350  entry_count,
351  invalid_slot_val,
352  for_semi_join,
353  key_component_count,
354  with_val_slot,
355  key_handler,
356  num_elems,
357  -1,
358  -1);
359  atomicCAS(err, 0, partial_err);
360 }
#define SUFFIX(name)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void fill_hash_join_buff_bucketized_wrapper ( OneToOnePerfectJoinHashTableFillFuncArgs const  args)

Definition at line 46 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_bucketized().

47  {
48  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(args, -1, -1);
49  atomicCAS(args.dev_err_buff, 0, partial_err);
50 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_hash_join_buff_on_device ( OneToOnePerfectJoinHashTableFillFuncArgs const  args)

Definition at line 57 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

57  {
58  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper, args);
59  }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void fill_hash_join_buff_wrapper(OneToOnePerfectJoinHashTableFillFuncArgs const args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const  args)

Definition at line 52 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

53  {
54  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper, args);
55  }
__global__ void fill_hash_join_buff_bucketized_wrapper(OneToOnePerfectJoinHashTableFillFuncArgs const args)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 102 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

104  {
105  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded, args, shard_info);
106  }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void fill_hash_join_buff_wrapper_sharded(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 95 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

97  {
98  cuda_kernel_launch_wrapper(
99  fill_hash_join_buff_wrapper_sharded_bucketized, args, shard_info);
100  }
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args, ShardInfo const shard_info)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

__global__ void fill_hash_join_buff_wrapper ( OneToOnePerfectJoinHashTableFillFuncArgs const  args)

Definition at line 37 of file HashJoinRuntimeGpu.cu.

References OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff(), fill_hash_join_buff_bitwise_eq(), SUFFIX, OneToOnePerfectJoinHashTableFillFuncArgs::type_info, and JoinColumnTypeInfo::uses_bw_eq.

Referenced by fill_hash_join_buff_on_device().

38  {
39  auto fill_hash_join_buff_func = args.type_info.uses_bw_eq
40  ? SUFFIX(fill_hash_join_buff_bitwise_eq)
41  : SUFFIX(fill_hash_join_buff);
42  int partial_err = fill_hash_join_buff_func(args, -1, -1);
43  atomicCAS(args.dev_err_buff, 0, partial_err);
44 }
DEVICE int SUFFIX() fill_hash_join_buff_bitwise_eq(OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 79 of file HashJoinRuntimeGpu.cu.

References OneToOnePerfectJoinHashTableFillFuncArgs::buff, OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff_sharded(), OneToOnePerfectJoinHashTableFillFuncArgs::for_semi_join, OneToOnePerfectJoinHashTableFillFuncArgs::invalid_slot_val, OneToOnePerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToOnePerfectJoinHashTableFillFuncArgs::type_info.

Referenced by fill_hash_join_buff_on_device_sharded().

81  {
82  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(args.buff,
83  args.invalid_slot_val,
84  args.for_semi_join,
85  args.join_column,
86  args.type_info,
87  shard_info,
88  NULL,
89  NULL,
90  -1,
91  -1);
92  atomicCAS(args.dev_err_buff, 0, partial_err);
93 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded_bucketized ( OneToOnePerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 61 of file HashJoinRuntimeGpu.cu.

References OneToOnePerfectJoinHashTableFillFuncArgs::bucket_normalization, OneToOnePerfectJoinHashTableFillFuncArgs::buff, OneToOnePerfectJoinHashTableFillFuncArgs::dev_err_buff, fill_hash_join_buff_sharded_bucketized(), OneToOnePerfectJoinHashTableFillFuncArgs::for_semi_join, OneToOnePerfectJoinHashTableFillFuncArgs::invalid_slot_val, OneToOnePerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToOnePerfectJoinHashTableFillFuncArgs::type_info.

Referenced by fill_hash_join_buff_on_device_sharded_bucketized().

63  {
64  int partial_err =
65  SUFFIX(fill_hash_join_buff_sharded_bucketized)(args.buff,
66  args.invalid_slot_val,
67  args.for_semi_join,
68  args.join_column,
69  args.type_info,
70  shard_info,
71  NULL,
72  NULL,
73  -1,
74  -1,
75  args.bucket_normalization);
76  atomicCAS(args.dev_err_buff, 0, partial_err);
77 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const KEY_HANDLER *  key_handler,
const size_t  num_elems,
const bool  for_window_framing 
)

Definition at line 260 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

265  {
266  auto pos_buff = buff;
267  auto count_buff = buff + hash_entry_count;
268  auto qe_cuda_stream = getQueryEngineCudaStream();
269  checkCudaErrors(
270  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
271  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
272  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
273  count_buff,
274  composite_key_dict,
275  hash_entry_count,
276  key_handler,
277  num_elems);
278 
279  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
280 
281  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
282  thrust::inclusive_scan(
283  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
284  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
285  checkCudaErrors(
286  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
287  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
288 
289  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
290  buff,
291  composite_key_dict,
292  hash_entry_count,
293  key_handler,
294  num_elems,
295  for_window_framing);
296 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const bool  for_window_framing 
)

Definition at line 450 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

457  {
458  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
459  composite_key_dict,
460  hash_entry_count,
461  key_handler,
462  num_elems,
463  for_window_framing);
464 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const bool  for_window_framing 
)

Definition at line 466 of file HashJoinRuntimeGpu.cu.

472  {
473  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
474  composite_key_dict,
475  hash_entry_count,
476  key_handler,
477  num_elems,
478  for_window_framing);
479 }
void fill_one_to_many_hash_table_on_device ( OneToManyPerfectJoinHashTableFillFuncArgs const  args)

Definition at line 175 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, BucketizedHashEntryInfo::bucketized_hash_entry_count, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), OneToManyPerfectJoinHashTableFillFuncArgs::for_window_framing, OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

176  {
177  auto buff = args.buff;
178  auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
179  auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
180  cuda_kernel_launch_wrapper(
181  SUFFIX(count_matches), count_buff, args.join_column, args.type_info);
182  };
183  auto fill_row_ids_func = [buff, hash_entry_count, &args] {
184  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
185  buff,
186  hash_entry_count,
187  args.join_column,
188  args.type_info,
189  args.for_window_framing);
190  };
191  fill_one_to_many_hash_table_on_device_impl(buff,
192  hash_entry_count,
193  args.join_column,
194  args.type_info,
195  count_matches_func,
196  fill_row_ids_func);
197 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_bucketized ( OneToManyPerfectJoinHashTableFillFuncArgs const  args)

Definition at line 199 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, OneToManyPerfectJoinHashTableFillFuncArgs::bucket_normalization, OneToManyPerfectJoinHashTableFillFuncArgs::buff, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), BucketizedHashEntryInfo::getNormalizedHashEntryCount(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, OneToManyPerfectJoinHashTableFillFuncArgs::join_column, SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

200  {
201  auto hash_entry_count = args.hash_entry_info.getNormalizedHashEntryCount();
202  auto const buff = args.buff;
203  auto count_matches_func = [count_buff = buff + hash_entry_count, &args] {
204  cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
205  count_buff,
206  args.join_column,
207  args.type_info,
208  args.bucket_normalization);
209  };
210  auto fill_row_ids_func = [buff, hash_entry_count, &args] {
211  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
212  buff,
213  hash_entry_count,
214  args.join_column,
215  args.type_info,
216  args.bucket_normalization);
217  };
218  fill_one_to_many_hash_table_on_device_impl(buff,
219  hash_entry_count,
220  args.join_column,
221  args.type_info,
222  count_matches_func,
223  fill_row_ids_func);
224 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
COUNT_MATCHES_FUNCTOR  count_matches_func,
FILL_ROW_IDS_FUNCTOR  fill_row_ids_func 
)

Definition at line 148 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

Referenced by fill_one_to_many_hash_table_on_device(), and fill_one_to_many_hash_table_on_device_bucketized().

153  {
154  int32_t* pos_buff = buff;
155  int32_t* count_buff = buff + hash_entry_count;
156  auto qe_cuda_stream = getQueryEngineCudaStream();
157  checkCudaErrors(
158  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
159  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
160  count_matches_func();
161 
162  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
163 
164  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
165  thrust::inclusive_scan(
166  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
167 
168  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
169  checkCudaErrors(
170  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
171  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
172  fill_row_ids_func();
173 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_on_device_sharded ( OneToManyPerfectJoinHashTableFillFuncArgs const  args,
ShardInfo const  shard_info 
)

Definition at line 226 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, OneToManyPerfectJoinHashTableFillFuncArgs::buff, checkCudaErrors, count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), getQueryEngineCudaStream(), OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info, inclusive_scan(), OneToManyPerfectJoinHashTableFillFuncArgs::join_column, set_valid_pos(), set_valid_pos_flag(), SUFFIX, and OneToManyPerfectJoinHashTableFillFuncArgs::type_info.

228  {
229  auto hash_entry_count = args.hash_entry_info.bucketized_hash_entry_count;
230  int32_t* pos_buff = args.buff;
231  int32_t* count_buff = args.buff + hash_entry_count;
232  auto qe_cuda_stream = getQueryEngineCudaStream();
233  checkCudaErrors(
234  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
235  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
236  cuda_kernel_launch_wrapper(count_matches_sharded,
237  count_buff,
238  args.join_column,
239  args.type_info,
240  shard_info);
241 
242  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
243 
244  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
245  inclusive_scan(
246  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
247  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
248  checkCudaErrors(
249  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
250  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
251  cuda_kernel_launch_wrapper(fill_row_ids_sharded,
252  args.buff,
253  hash_entry_count,
254  args.join_column,
255  args.type_info,
256  shard_info);
257 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

CUstream getQueryEngineCudaStream ( )

Definition at line 3 of file QueryEngine.cpp.

3  { // NOTE: CUstream is cudaStream_t
4  return QueryEngine::getInstance()->getCudaStream();
5 }
static std::shared_ptr< QueryEngine > getInstance()
Definition: QueryEngine.h:89
void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 313 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

317  {
318  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
319  hash_join_buff,
320  entry_count,
321  key_component_count,
322  with_val_slot,
323  invalid_slot_val);
324 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 326 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

330  {
331  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
332  hash_join_buff,
333  entry_count,
334  key_component_count,
335  with_val_slot,
336  invalid_slot_val);
337 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 299 of file HashJoinRuntimeGpu.cu.

References init_baseline_hash_join_buff(), SUFFIX, and heavydb.dtypes::T.

303  {
304  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
305  entry_count,
306  key_component_count,
307  with_val_slot,
308  invalid_slot_val,
309  -1,
310  -1);
311 }
#define SUFFIX(name)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void init_hash_join_buff_on_device ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 114 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

116  {
117  cuda_kernel_launch_wrapper(
118  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
119 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void init_hash_join_buff_wrapper ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 108 of file HashJoinRuntimeGpu.cu.

References init_hash_join_buff(), and SUFFIX.

Referenced by init_hash_join_buff_on_device().

110  {
111  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
112 }
#define SUFFIX(name)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void range_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 429 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

436  {
437  cuda_kernel_launch_wrapper(
438  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
439  hash_buff,
440  entry_count,
441  invalid_slot_val,
442  false,
443  key_component_count,
444  with_val_slot,
445  dev_err_buff,
446  key_handler,
447  num_elems);
448 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void range_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const size_t  hash_entry_count,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 491 of file HashJoinRuntimeGpu.cu.

496  {
497  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
498  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
499 }
__global__ void set_valid_pos ( int32_t *  pos_buff,
int32_t *  count_buff,
const int64_t  entry_count 
)

Definition at line 135 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

137  {
138  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
139  const int32_t step = blockDim.x * gridDim.x;
140  for (int64_t i = start; i < entry_count; i += step) {
141  if (VALID_POS_FLAG == pos_buff[i]) {
142  pos_buff[i] = !i ? 0 : count_buff[i - 1];
143  }
144  }
145 }
#define VALID_POS_FLAG

+ Here is the caller graph for this function:

__global__ void set_valid_pos_flag ( int32_t *  pos_buff,
const int32_t *  count_buff,
const int64_t  entry_count 
)

Definition at line 123 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

125  {
126  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
127  const int32_t step = blockDim.x * gridDim.x;
128  for (int64_t i = start; i < entry_count; i += step) {
129  if (count_buff[i]) {
130  pos_buff[i] = VALID_POS_FLAG;
131  }
132  }
133 }
#define VALID_POS_FLAG

+ Here is the caller graph for this function: