OmniSciDB  c1a53651b2
HashJoinRuntimeGpu.cu File Reference
#include "HashJoinRuntime.cpp"
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)
 
#define VALID_POS_FLAG   0
 

Functions

CUstream getQueryEngineCudaStream ()
 
template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper (F func, ARGS &&...args)
 
__global__ void fill_hash_join_buff_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
 
__global__ void fill_hash_join_buff_bucketized_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
 
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
 
__global__ void fill_hash_join_buff_wrapper_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
 
void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
 
__global__ void init_hash_join_buff_wrapper (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
__global__ void set_valid_pos_flag (int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
 
__global__ void set_valid_pos (int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)
 
template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table_on_device (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const bool for_window_framing)
 
void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
 
template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *key_handler, const size_t num_elems, const bool for_window_framing)
 
template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_sz_threshold)
 

Macro Definition Documentation

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)

Definition at line 24 of file HashJoinRuntimeGpu.cu.
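
In this file the macro wraps every CUDA runtime call so that any status other than cudaSuccess trips a fatal CHECK_EQ in the project's logging framework. A minimal usage sketch (the buffer and stream names are illustrative, not from this file):

    checkCudaErrors(cudaMemsetAsync(dev_buf, 0, n * sizeof(int32_t), qe_cuda_stream));
    checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));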

#define VALID_POS_FLAG   0

Definition at line 193 of file HashJoinRuntimeGpu.cu.

Referenced by set_valid_pos(), and set_valid_pos_flag().

Function Documentation

void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)

Definition at line 625 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

628  {
629  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
630  hll_buffer,
631  nullptr,
632  b,
633  num_elems,
634  key_handler);
635 }
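
The b argument is the number of HyperLogLog bucket-index bits, so hll_buffer is expected to hold 2^b one-byte registers. A hedged host-side sizing sketch (the values are illustrative, not from the source):

    const uint32_t b = 11;                           // 2^11 = 2048 HLL registers
    const size_t hll_buffer_bytes = size_t(1) << b;  // one uint8_t register each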

void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)

Definition at line 591 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

595  {
596  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
597  hll_buffer,
598  row_counts_buffer,
599  b,
600  num_elems,
601  key_handler);
602 
603  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
604  thrust::inclusive_scan(
605  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
606 }
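
The in-place thrust::inclusive_scan converts the per-row match counts produced by the kernel into cumulative totals. A self-contained illustration of that transform (a standalone sketch, not from the source):

    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <vector>

    int main() {
      std::vector<int32_t> h{2, 0, 3, 1};
      thrust::device_vector<int32_t> d(h.begin(), h.end());
      thrust::inclusive_scan(d.begin(), d.end(), d.begin());
      // d now holds {2, 2, 5, 6}: element i is the cumulative count through row i.
      return 0;
    }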

void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)

Definition at line 608 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

614  {
615  auto qe_cuda_stream = getQueryEngineCudaStream();
616  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
617  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
618  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
619 
620  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
621  thrust::inclusive_scan(
622  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
623 }
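
Unlike the other entry points in this file, this variant takes block_size_x and grid_size_x from the caller and launches directly on the query-engine stream instead of delegating block-size selection to cuda_kernel_launch_wrapper().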

void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_sz_threshold)

Definition at line 637 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

640  {
641  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
642  bucket_sizes_buffer,
643  join_column,
644  type_info,
645  bucket_sz_threshold);
646 }

template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper (F func, ARGS &&... args)

Definition at line 27 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors and getQueryEngineCudaStream().

Referenced by approximate_distinct_tuples_on_device(), approximate_distinct_tuples_on_device_overlaps(), compute_bucket_sizes_on_device(), fill_baseline_hash_join_buff_on_device_32(), fill_baseline_hash_join_buff_on_device_64(), fill_hash_join_buff_on_device(), fill_hash_join_buff_on_device_bucketized(), fill_hash_join_buff_on_device_sharded(), fill_hash_join_buff_on_device_sharded_bucketized(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device(), fill_one_to_many_hash_table_on_device_bucketized(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), init_baseline_hash_join_buff_on_device_32(), init_baseline_hash_join_buff_on_device_64(), init_hash_join_buff_on_device(), overlaps_fill_baseline_hash_join_buff_on_device_64(), and range_fill_baseline_hash_join_buff_on_device_64().

27  {
28  int grid_size = -1;
29  int block_size = -1;
30  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
31  auto qe_cuda_stream = getQueryEngineCudaStream();
32  func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(args)...);
33  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
34 }
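
The wrapper delegates launch-dimension selection to the CUDA runtime: cudaOccupancyMaxPotentialBlockSize() fills in the block size that maximizes occupancy for the given kernel and the minimum grid size needed to reach that occupancy on the current device. A standalone sketch of the same pattern (the kernel itself is illustrative):

    #include <cuda_runtime.h>

    __global__ void scale(int* data, int n) {
      // Grid-stride loop: correct for any grid/block combination.
      for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        data[i] *= 2;
      }
    }

    void launch_scale(int* data, int n, cudaStream_t stream) {
      int grid_size = -1;
      int block_size = -1;
      cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, scale);
      scale<<<grid_size, block_size, 0, stream>>>(data, n);
      cudaStreamSynchronize(stream);
    }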

void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)

Definition at line 453 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

461  {
462  cuda_kernel_launch_wrapper(
463  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
464  hash_buff,
465  entry_count,
466  invalid_slot_val,
467  for_semi_join,
468  key_component_count,
469  with_val_slot,
470  dev_err_buff,
471  key_handler,
472  num_elems);
473 }

void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)

Definition at line 475 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

483  {
484  cuda_kernel_launch_wrapper(
485  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
486  hash_buff,
487  entry_count,
488  invalid_slot_val,
489  for_semi_join,
490  key_component_count,
491  with_val_slot,
492  dev_err_buff,
493  key_handler,
494  num_elems);
495 }

template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)

Definition at line 431 of file HashJoinRuntimeGpu.cu.

References fill_baseline_hash_join_buff() and SUFFIX.

439  {
440  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
441  entry_count,
442  invalid_slot_val,
443  for_semi_join,
444  key_component_count,
445  with_val_slot,
446  key_handler,
447  num_elems,
448  -1,
449  -1);
450  atomicCAS(err, 0, partial_err);
451 }
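
The closing atomicCAS(err, 0, partial_err) is a first-error-wins idiom: the compare-and-swap only succeeds while *err is still 0, so once one thread publishes a nonzero status, later failures cannot overwrite it. The idiom in isolation (an illustrative kernel, not from the source):

    __global__ void record_first_error(const int* status, int n, int* err) {
      const int i = threadIdx.x + blockDim.x * blockIdx.x;
      if (i < n && status[i] != 0) {
        atomicCAS(err, 0, status[i]);  // a no-op unless *err is still 0
      }
    }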

__global__ void fill_hash_join_buff_bucketized_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)

Definition at line 47 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_bucketized().

54  {
55  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
56  invalid_slot_val,
57  for_semi_join,
58  join_column,
59  type_info,
60  NULL,
61  NULL,
62  -1,
63  -1,
64  bucket_normalization);
65  atomicCAS(err, 0, partial_err);
66 }

void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)

Definition at line 85 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

90  {
91  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
92  buff,
93  invalid_slot_val,
94  for_semi_join,
95  join_column,
96  type_info,
97  dev_err_buff);
98 }

void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)

Definition at line 68 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

74  {
75  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
76  buff,
77  invalid_slot_val,
78  for_semi_join,
79  join_column,
80  type_info,
81  dev_err_buff,
82  bucket_normalization);
83 }

void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)

Definition at line 163 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

169  {
170  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
171  buff,
172  invalid_slot_val,
173  for_semi_join,
174  join_column,
175  type_info,
176  shard_info,
177  dev_err_buff);
178 }

void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)

Definition at line 143 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

151  {
152  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
153  buff,
154  invalid_slot_val,
155  for_semi_join,
156  join_column,
157  type_info,
158  shard_info,
159  dev_err_buff,
160  bucket_normalization);
161 }

__global__ void fill_hash_join_buff_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)

Definition at line 36 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device().

41  {
42  int partial_err = SUFFIX(fill_hash_join_buff)(
43  buff, invalid_slot_val, for_semi_join, join_column, type_info, NULL, NULL, -1, -1);
44  atomicCAS(err, 0, partial_err);
45 }
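
Judging by the shared CPU/GPU signature, the trailing NULL, NULL, -1, -1 arguments stand for sd_inner_to_outer_translation_map, min_inner_elem, cpu_thread_idx, and cpu_thread_count: the GPU path supplies no string-dictionary translation map, and the device build derives its thread index and stride from the CUDA grid, so the -1 CPU-thread sentinels go unused.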

__global__ void fill_hash_join_buff_wrapper_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)

Definition at line 123 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded().

129  {
130  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(buff,
131  invalid_slot_val,
132  for_semi_join,
133  join_column,
134  type_info,
135  shard_info,
136  NULL,
137  NULL,
138  -1,
139  -1);
140  atomicCAS(err, 0, partial_err);
141 }

__global__ void fill_hash_join_buff_wrapper_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)

Definition at line 100 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded_bucketized().

108  {
109  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
110  invalid_slot_val,
111  for_semi_join,
112  join_column,
113  type_info,
114  shard_info,
115  NULL,
116  NULL,
117  -1,
118  -1,
119  bucket_normalization);
120  atomicCAS(err, 0, partial_err);
121 }

template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *key_handler, const size_t num_elems, const bool for_window_framing)

Definition at line 351 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

356  {
357  auto pos_buff = buff;
358  auto count_buff = buff + hash_entry_count;
359  auto qe_cuda_stream = getQueryEngineCudaStream();
360  checkCudaErrors(
361  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
362  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
363  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
364  count_buff,
365  composite_key_dict,
366  hash_entry_count,
367  key_handler,
368  num_elems);
369 
370  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
371 
372  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
373  thrust::inclusive_scan(
374  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
375  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
376  checkCudaErrors(
377  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
378  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
379 
380  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
381  buff,
382  composite_key_dict,
383  hash_entry_count,
384  key_handler,
385  num_elems,
386  for_window_framing);
387 }
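
The pointer arithmetic above implies the one-to-many buffer layout used throughout this file, sketched here as a hedged reconstruction (H = hash_entry_count):

    // [ pos: H x int32 ][ count: H x int32 ][ row ids for all matches ... ]
    //   ^ buff            ^ buff + H
    //
    // pos[i]   -> offset of entry i's first row id in the payload section
    // count[i] -> number of input rows hashing to entry i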

void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)

Definition at line 540 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

547  {
548  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
549  composite_key_dict,
550  hash_entry_count,
551  key_handler,
552  num_elems,
553  for_window_framing);
554 }


void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)

Definition at line 556 of file HashJoinRuntimeGpu.cu.

562  {
563  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
564  composite_key_dict,
565  hash_entry_count,
566  key_handler,
567  num_elems,
568  for_window_framing);
569 }

void fill_one_to_many_hash_table_on_device (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const bool for_window_framing)

Definition at line 247 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), and SUFFIX.

251  {
252  auto hash_entry_count = hash_entry_info.bucketized_hash_entry_count;
253  auto count_matches_func = [count_buff = buff + hash_entry_count,
254  join_column,
255  type_info] {
256  cuda_kernel_launch_wrapper(SUFFIX(count_matches), count_buff, join_column, type_info);
257  };
258 
259  auto fill_row_ids_func =
260  [buff, hash_entry_count, join_column, type_info, for_window_framing] {
261  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
262  buff,
263  hash_entry_count,
264  join_column,
265  type_info,
266  for_window_framing);
267  };
268 
269  fill_one_to_many_hash_table_on_device_impl(buff,
270  hash_entry_count,
271  join_column,
272  type_info,
273  count_matches_func,
274  fill_row_ids_func);
275 }

void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)

Definition at line 277 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucket_normalization, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), BucketizedHashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

281  {
282  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
283  auto count_matches_func = [count_buff = buff + hash_entry_count,
284  join_column,
285  type_info,
286  bucket_normalization =
287  hash_entry_info.bucket_normalization] {
288  cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
289  count_buff,
290  join_column,
291  type_info,
292  bucket_normalization);
293  };
294 
295  auto fill_row_ids_func = [buff,
296  hash_entry_count =
297  hash_entry_info.getNormalizedHashEntryCount(),
298  join_column,
299  type_info,
300  bucket_normalization = hash_entry_info.bucket_normalization] {
301  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
302  buff,
303  hash_entry_count,
304  join_column,
305  type_info,
306  bucket_normalization);
307  };
308 
309  fill_one_to_many_hash_table_on_device_impl(buff,
310  hash_entry_count,
311  join_column,
312  type_info,
313  count_matches_func,
314  fill_row_ids_func);
315 }

template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)

Definition at line 220 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

Referenced by fill_one_to_many_hash_table_on_device(), and fill_one_to_many_hash_table_on_device_bucketized().

225  {
226  int32_t* pos_buff = buff;
227  int32_t* count_buff = buff + hash_entry_count;
228  auto qe_cuda_stream = getQueryEngineCudaStream();
229  checkCudaErrors(
230  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
231  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
232  count_matches_func();
233 
234  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
235 
236  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
237  thrust::inclusive_scan(
238  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
239 
240  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
241  checkCudaErrors(
242  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
243  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
244  fill_row_ids_func();
245 }
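
Read top to bottom, the helper builds the one-to-many table in fixed phases: zero the count section, count matches per hash entry, flag occupied entries (set_valid_pos_flag), turn the counts into running totals with thrust::inclusive_scan, convert those totals into per-entry start offsets (set_valid_pos), zero the counts again so the fill kernels can reuse them as write cursors, and finally emit the matching row ids. A small worked example (values illustrative):

    // counts after count_matches_func:   [2, 0, 3]
    // pos after set_valid_pos_flag:      [0, untouched, 0]   (0 == VALID_POS_FLAG)
    // counts after inclusive_scan:       [2, 2, 5]
    // pos after set_valid_pos:           [0, untouched, 2]   (pos[i] = i ? count[i-1] : 0)
    // counts zeroed for the fill pass:   [0, 0, 0]
    // fill_row_ids_func then writes entry 0's rows at payload offsets 0-1
    // and entry 2's rows at offsets 2-4.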

void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)

Definition at line 317 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, checkCudaErrors, count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), set_valid_pos_flag(), and SUFFIX.

322  {
323  auto hash_entry_count = hash_entry_info.bucketized_hash_entry_count;
324  int32_t* pos_buff = buff;
325  int32_t* count_buff = buff + hash_entry_count;
326  auto qe_cuda_stream = getQueryEngineCudaStream();
327  checkCudaErrors(
328  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
329  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
330  cuda_kernel_launch_wrapper(
331  SUFFIX(count_matches_sharded), count_buff, join_column, type_info, shard_info);
332 
333  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
334 
335  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
336  thrust::inclusive_scan(
337  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
338  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
339  checkCudaErrors(
340  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
341  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
342  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
343  buff,
344  hash_entry_count,
345  join_column,
346  type_info,
347  shard_info);
348 }

CUstream getQueryEngineCudaStream ()

Definition at line 3 of file QueryEngine.cpp.

3  { // NOTE: CUstream is cudaStream_t
4  return QueryEngine::getInstance()->getCudaStream();
5 }
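
Every launch in this file targets this shared QueryEngine stream and is immediately followed by cudaStreamSynchronize(), so the host-side builder functions documented here are effectively synchronous for their callers.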

void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)

Definition at line 404 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

408  {
409  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
410  hash_join_buff,
411  entry_count,
412  key_component_count,
413  with_val_slot,
414  invalid_slot_val);
415 }

void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)

Definition at line 417 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

421  {
422  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
423  hash_join_buff,
424  entry_count,
425  key_component_count,
426  with_val_slot,
427  invalid_slot_val);
428 }

template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)

Definition at line 390 of file HashJoinRuntimeGpu.cu.

References init_baseline_hash_join_buff() and SUFFIX.

394  {
395  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
396  entry_count,
397  key_component_count,
398  with_val_slot,
399  invalid_slot_val,
400  -1,
401  -1);
402 }

void init_hash_join_buff_on_device (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

Definition at line 186 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

188  {
189  cuda_kernel_launch_wrapper(
190  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
191 }

__global__ void init_hash_join_buff_wrapper (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

Definition at line 180 of file HashJoinRuntimeGpu.cu.

References init_hash_join_buff(), and SUFFIX.

Referenced by init_hash_join_buff_on_device().

182  {
183  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
184 }

void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)

Definition at line 497 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

505  {
506  cuda_kernel_launch_wrapper(
507  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
508  hash_buff,
509  entry_count,
510  invalid_slot_val,
511  false,
512  key_component_count,
513  with_val_slot,
514  dev_err_buff,
515  key_handler,
516  num_elems);
517 }

void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const OverlapsKeyHandler *key_handler, const int64_t num_elems)

Definition at line 571 of file HashJoinRuntimeGpu.cu.

576  {
577  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
578  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
579 }

void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)

Definition at line 519 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

526  {
527  cuda_kernel_launch_wrapper(
528  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
529  hash_buff,
530  entry_count,
531  invalid_slot_val,
532  false,
533  key_component_count,
534  with_val_slot,
535  dev_err_buff,
536  key_handler,
537  num_elems);
538 }

void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)

Definition at line 581 of file HashJoinRuntimeGpu.cu.

586  {
587  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
588  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
589 }

__global__ void set_valid_pos (int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

Definition at line 207 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

209  {
210  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
211  const int32_t step = blockDim.x * gridDim.x;
212  for (int64_t i = start; i < entry_count; i += step) {
213  if (VALID_POS_FLAG == pos_buff[i]) {
214  pos_buff[i] = !i ? 0 : count_buff[i - 1];
215  }
216  }
217 }

__global__ void set_valid_pos_flag (int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)

Definition at line 195 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

197  {
198  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
199  const int32_t step = blockDim.x * gridDim.x;
200  for (int64_t i = start; i < entry_count; i += step) {
201  if (count_buff[i]) {
202  pos_buff[i] = VALID_POS_FLAG;
203  }
204  }
205 }
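
Taken together with set_valid_pos(), the flag pass is what keeps empty entries distinguishable: only entries with a nonzero match count are marked with VALID_POS_FLAG, and only marked entries later receive a start offset from the scanned counts. Unmarked entries keep whatever the buffer was initialized with, presumably the invalid-slot sentinel written by init_hash_join_buff(), so probes of empty entries fail immediately.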