OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntimeGpu.cu File Reference
#include "HashJoinRuntime.cpp"
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
+ Include dependency graph for HashJoinRuntimeGpu.cu:

Go to the source code of this file.

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, cudaSuccess)
 
#define VALID_POS_FLAG   0
 

Functions

CUstream getQueryEngineCudaStream ()
 
template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper (F func, ARGS &&...args)
 
__global__ void fill_hash_join_buff_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
 
__global__ void fill_hash_join_buff_bucketized_wrapper (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
 
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
 
__global__ void fill_hash_join_buff_wrapper_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
 
void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
 
__global__ void init_hash_join_buff_wrapper (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
 
__global__ void set_valid_pos_flag (int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
 
__global__ void set_valid_pos (int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)
 
template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl (int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
 
void fill_one_to_many_hash_table_on_device (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
 
template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device (int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *key_handler, const size_t num_elems)
 
template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_sz_threshold)
 

Macro Definition Documentation

#define checkCudaErrors (   err)    CHECK_EQ(err, cudaSuccess)

Definition at line 24 of file HashJoinRuntimeGpu.cu.

#define VALID_POS_FLAG   0

Definition at line 193 of file HashJoinRuntimeGpu.cu.

Referenced by set_valid_pos(), and set_valid_pos_flag().

Function Documentation

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 606 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

609  {
610  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
611  hll_buffer,
612  nullptr,
613  b,
614  num_elems,
615  key_handler);
616 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_overlaps ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 572 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

576  {
577  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
578  hll_buffer,
579  row_counts_buffer,
580  b,
581  num_elems,
582  key_handler);
583 
584  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
586  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
587 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_range ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 589 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

595  {
596  auto qe_cuda_stream = getQueryEngineCudaStream();
597  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
598  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
599  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
600 
601  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
603  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
604 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_sz_threshold 
)

Definition at line 618 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

621  {
622  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
623  bucket_sizes_buffer,
624  join_column,
625  type_info,
626  bucket_sz_threshold);
627 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename F , typename... ARGS>
void cuda_kernel_launch_wrapper ( func,
ARGS &&...  args 
)

Definition at line 27 of file HashJoinRuntimeGpu.cu.

References run_benchmark_import::args, checkCudaErrors, and getQueryEngineCudaStream().

Referenced by approximate_distinct_tuples_on_device(), approximate_distinct_tuples_on_device_overlaps(), compute_bucket_sizes_on_device(), fill_baseline_hash_join_buff_on_device_32(), fill_baseline_hash_join_buff_on_device_64(), fill_hash_join_buff_on_device(), fill_hash_join_buff_on_device_bucketized(), fill_hash_join_buff_on_device_sharded(), fill_hash_join_buff_on_device_sharded_bucketized(), fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device(), fill_one_to_many_hash_table_on_device_bucketized(), fill_one_to_many_hash_table_on_device_impl(), fill_one_to_many_hash_table_on_device_sharded(), init_baseline_hash_join_buff_on_device_32(), init_baseline_hash_join_buff_on_device_64(), init_hash_join_buff_on_device(), overlaps_fill_baseline_hash_join_buff_on_device_64(), and range_fill_baseline_hash_join_buff_on_device_64().

27  {
28  int grid_size = -1;
29  int block_size = -1;
30  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
31  auto qe_cuda_stream = getQueryEngineCudaStream();
32  func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(args)...);
33  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
34 }
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 444 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

452  {
454  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
455  hash_buff,
456  entry_count,
457  invalid_slot_val,
458  for_semi_join,
459  key_component_count,
460  with_val_slot,
461  dev_err_buff,
462  key_handler,
463  num_elems);
464 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 466 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

474  {
476  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
477  hash_buff,
478  entry_count,
479  invalid_slot_val,
480  for_semi_join,
481  key_component_count,
482  with_val_slot,
483  dev_err_buff,
484  key_handler,
485  num_elems);
486 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

template<typename T , typename KEY_HANDLER >
__global__ void fill_baseline_hash_join_buff_wrapper ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  err,
const KEY_HANDLER *  key_handler,
const int64_t  num_elems 
)

Definition at line 422 of file HashJoinRuntimeGpu.cu.

References fill_baseline_hash_join_buff(), SUFFIX, and heavydb.dtypes::T.

430  {
431  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
432  entry_count,
433  invalid_slot_val,
434  for_semi_join,
435  key_component_count,
436  with_val_slot,
437  key_handler,
438  num_elems,
439  -1,
440  -1);
441  atomicCAS(err, 0, partial_err);
442 }
#define SUFFIX(name)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

__global__ void fill_hash_join_buff_bucketized_wrapper ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
int *  err,
const int64_t  bucket_normalization 
)

Definition at line 47 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_bucketized().

54  {
55  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
56  invalid_slot_val,
57  for_semi_join,
58  join_column,
59  type_info,
60  NULL,
61  NULL,
62  -1,
63  -1,
64  bucket_normalization);
65  atomicCAS(err, 0, partial_err);
66 }
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
#define SUFFIX(name)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_hash_join_buff_on_device ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info 
)

Definition at line 85 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

90  {
92  buff,
93  invalid_slot_val,
94  for_semi_join,
95  join_column,
96  type_info,
97  dev_err_buff);
98 }
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int64_t  bucket_normalization 
)

Definition at line 68 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

74  {
76  buff,
77  invalid_slot_val,
78  for_semi_join,
79  join_column,
80  type_info,
81  dev_err_buff,
82  bucket_normalization);
83 }
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info 
)

Definition at line 163 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

169  {
171  buff,
172  invalid_slot_val,
173  for_semi_join,
174  join_column,
175  type_info,
176  shard_info,
177  dev_err_buff);
178 }
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int64_t  bucket_normalization 
)

Definition at line 143 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

151  {
153  buff,
154  invalid_slot_val,
155  for_semi_join,
156  join_column,
157  type_info,
158  shard_info,
159  dev_err_buff,
160  bucket_normalization);
161 }
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

__global__ void fill_hash_join_buff_wrapper ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
int *  err 
)

Definition at line 36 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device().

41  {
42  int partial_err = SUFFIX(fill_hash_join_buff)(
43  buff, invalid_slot_val, for_semi_join, join_column, type_info, NULL, NULL, -1, -1);
44  atomicCAS(err, 0, partial_err);
45 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
int *  err 
)

Definition at line 123 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded().

129  {
130  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(buff,
131  invalid_slot_val,
132  for_semi_join,
133  join_column,
134  type_info,
135  shard_info,
136  NULL,
137  NULL,
138  -1,
139  -1);
140  atomicCAS(err, 0, partial_err);
141 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void fill_hash_join_buff_wrapper_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
int *  err,
const int64_t  bucket_normalization 
)

Definition at line 100 of file HashJoinRuntimeGpu.cu.

References fill_hash_join_buff_sharded_bucketized(), and SUFFIX.

Referenced by fill_hash_join_buff_on_device_sharded_bucketized().

108  {
109  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
110  invalid_slot_val,
111  for_semi_join,
112  join_column,
113  type_info,
114  shard_info,
115  NULL,
116  NULL,
117  -1,
118  -1,
119  bucket_normalization);
120  atomicCAS(err, 0, partial_err);
121 }
#define SUFFIX(name)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename KEY_HANDLER >
void fill_one_to_many_baseline_hash_table_on_device ( int32_t *  buff,
const T *  composite_key_dict,
const int64_t  hash_entry_count,
const KEY_HANDLER *  key_handler,
const size_t  num_elems 
)

Definition at line 344 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

348  {
349  auto pos_buff = buff;
350  auto count_buff = buff + hash_entry_count;
351  auto qe_cuda_stream = getQueryEngineCudaStream();
353  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
354  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
355  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
356  count_buff,
357  composite_key_dict,
358  hash_entry_count,
359  key_handler,
360  num_elems);
361 
362  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
363 
364  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
366  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
367  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
369  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
370  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
371 
372  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
373  buff,
374  composite_key_dict,
375  hash_entry_count,
376  key_handler,
377  num_elems);
378 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 531 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

537  {
538  fill_one_to_many_baseline_hash_table_on_device<int32_t>(
539  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
540 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 542 of file HashJoinRuntimeGpu.cu.

547  {
548  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
549  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
550 }
void fill_one_to_many_hash_table_on_device ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info 
)

Definition at line 247 of file HashJoinRuntimeGpu.cu.

References count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

250  {
251  auto hash_entry_count = hash_entry_info.hash_entry_count;
252  auto count_matches_func = [count_buff = buff + hash_entry_count,
253  join_column,
254  type_info] {
255  cuda_kernel_launch_wrapper(SUFFIX(count_matches), count_buff, join_column, type_info);
256  };
257 
258  auto fill_row_ids_func = [buff, hash_entry_count, join_column, type_info] {
260  SUFFIX(fill_row_ids), buff, hash_entry_count, join_column, type_info);
261  };
262 
264  hash_entry_count,
265  join_column,
266  type_info,
267  count_matches_func,
268  fill_row_ids_func);
269 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
size_t hash_entry_count
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info 
)

Definition at line 271 of file HashJoinRuntimeGpu.cu.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

275  {
276  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
277  auto count_matches_func = [count_buff = buff + hash_entry_count,
278  join_column,
279  type_info,
280  bucket_normalization =
281  hash_entry_info.bucket_normalization] {
283  count_buff,
284  join_column,
285  type_info,
286  bucket_normalization);
287  };
288 
289  auto fill_row_ids_func = [buff,
290  hash_entry_count =
291  hash_entry_info.getNormalizedHashEntryCount(),
292  join_column,
293  type_info,
294  bucket_normalization = hash_entry_info.bucket_normalization] {
296  buff,
297  hash_entry_count,
298  join_column,
299  type_info,
300  bucket_normalization);
301  };
302 
304  hash_entry_count,
305  join_column,
306  type_info,
307  count_matches_func,
308  fill_row_ids_func);
309 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
size_t getNormalizedHashEntryCount() const
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

template<typename COUNT_MATCHES_FUNCTOR , typename FILL_ROW_IDS_FUNCTOR >
void fill_one_to_many_hash_table_on_device_impl ( int32_t *  buff,
const int64_t  hash_entry_count,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
COUNT_MATCHES_FUNCTOR  count_matches_func,
FILL_ROW_IDS_FUNCTOR  fill_row_ids_func 
)

Definition at line 220 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, cuda_kernel_launch_wrapper(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), and set_valid_pos_flag().

Referenced by fill_one_to_many_hash_table_on_device(), and fill_one_to_many_hash_table_on_device_bucketized().

225  {
226  int32_t* pos_buff = buff;
227  int32_t* count_buff = buff + hash_entry_count;
228  auto qe_cuda_stream = getQueryEngineCudaStream();
230  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
231  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
232  count_matches_func();
233 
234  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
235 
236  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
238  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
239 
240  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
242  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
243  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
244  fill_row_ids_func();
245 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_on_device_sharded ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info 
)

Definition at line 311 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), getQueryEngineCudaStream(), HashEntryInfo::hash_entry_count, inclusive_scan(), set_valid_pos(), set_valid_pos_flag(), and SUFFIX.

315  {
316  auto hash_entry_count = hash_entry_info.hash_entry_count;
317  int32_t* pos_buff = buff;
318  int32_t* count_buff = buff + hash_entry_count;
319  auto qe_cuda_stream = getQueryEngineCudaStream();
321  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
322  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
324  SUFFIX(count_matches_sharded), count_buff, join_column, type_info, shard_info);
325 
326  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
327 
328  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
330  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
331  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
333  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
334  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
336  buff,
337  hash_entry_count,
338  join_column,
339  type_info,
340  shard_info);
341 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
size_t hash_entry_count
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

CUstream getQueryEngineCudaStream ( )

Definition at line 3 of file QueryEngine.cpp.

3  { // NOTE: CUstream is cudaStream_t
4  return QueryEngine::getInstance()->getCudaStream();
5 }
static std::shared_ptr< QueryEngine > getInstance()
Definition: QueryEngine.h:81
void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 395 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

399  {
400  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
401  hash_join_buff,
402  entry_count,
403  key_component_count,
404  with_val_slot,
405  invalid_slot_val);
406 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 408 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

412  {
413  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
414  hash_join_buff,
415  entry_count,
416  key_component_count,
417  with_val_slot,
418  invalid_slot_val);
419 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
__global__ void init_baseline_hash_join_buff_wrapper ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 381 of file HashJoinRuntimeGpu.cu.

References init_baseline_hash_join_buff(), SUFFIX, and heavydb.dtypes::T.

385  {
386  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
387  entry_count,
388  key_component_count,
389  with_val_slot,
390  invalid_slot_val,
391  -1,
392  -1);
393 }
#define SUFFIX(name)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

void init_hash_join_buff_on_device ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 186 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

188  {
190  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
191 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__global__ void init_hash_join_buff_wrapper ( int32_t *  buff,
const int64_t  hash_entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 180 of file HashJoinRuntimeGpu.cu.

References init_hash_join_buff(), and SUFFIX.

Referenced by init_hash_join_buff_on_device().

182  {
183  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
184 }
#define SUFFIX(name)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void overlaps_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 488 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

496  {
498  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
499  hash_buff,
500  entry_count,
501  invalid_slot_val,
502  false,
503  key_component_count,
504  with_val_slot,
505  dev_err_buff,
506  key_handler,
507  num_elems);
508 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 552 of file HashJoinRuntimeGpu.cu.

557  {
558  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
559  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
560 }
void range_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 510 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

517  {
519  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
520  hash_buff,
521  entry_count,
522  invalid_slot_val,
523  false,
524  key_component_count,
525  with_val_slot,
526  dev_err_buff,
527  key_handler,
528  num_elems);
529 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void range_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const size_t  hash_entry_count,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 562 of file HashJoinRuntimeGpu.cu.

567  {
568  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
569  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
570 }
__global__ void set_valid_pos ( int32_t *  pos_buff,
int32_t *  count_buff,
const int64_t  entry_count 
)

Definition at line 207 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

209  {
210  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
211  const int32_t step = blockDim.x * gridDim.x;
212  for (int64_t i = start; i < entry_count; i += step) {
213  if (VALID_POS_FLAG == pos_buff[i]) {
214  pos_buff[i] = !i ? 0 : count_buff[i - 1];
215  }
216  }
217 }
#define VALID_POS_FLAG

+ Here is the caller graph for this function:

__global__ void set_valid_pos_flag ( int32_t *  pos_buff,
const int32_t *  count_buff,
const int64_t  entry_count 
)

Definition at line 195 of file HashJoinRuntimeGpu.cu.

References VALID_POS_FLAG.

Referenced by fill_one_to_many_baseline_hash_table_on_device(), fill_one_to_many_hash_table_on_device_impl(), and fill_one_to_many_hash_table_on_device_sharded().

197  {
198  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
199  const int32_t step = blockDim.x * gridDim.x;
200  for (int64_t i = start; i < entry_count; i += step) {
201  if (count_buff[i]) {
202  pos_buff[i] = VALID_POS_FLAG;
203  }
204  }
205 }
#define VALID_POS_FLAG

+ Here is the caller graph for this function: