OmniSciDB  dfae7c3b14
HashJoinRuntimeGpu.cu
/*
 * Copyright 2017 MapD Technologies, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
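// The shared hash join runtime is included as a source file so that its fill/init
// routines are compiled as device code for the kernels defined below.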
#include "HashJoinRuntime.cpp"

#include <thrust/device_ptr.h>
#include <thrust/scan.h>

#define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)

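// Launches `func` with a grid/block configuration chosen by the CUDA occupancy API
// (cudaOccupancyMaxPotentialBlockSize) and checks for launch errors afterwards.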
template <typename F, typename... ARGS>
void cuda_kernel_launch_wrapper(F func, ARGS&&... args) {
  int grid_size = -1;
  int block_size = -1;
  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
  func<<<grid_size, block_size>>>(std::forward<ARGS>(args)...);
  checkCudaErrors(cudaGetLastError());
}

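// Device-side wrappers around the shared one-to-one fill routines. The trailing
// NULL/-1 arguments correspond to CPU-only parameters (string dictionary proxies and
// CPU thread bookkeeping) that the GPU path does not use. atomicCAS keeps only the
// first non-zero error code reported by any thread. The *_bucketized variants divide
// each join key by bucket_normalization when computing its slot.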
__global__ void fill_hash_join_buff_wrapper(int32_t* buff,
                                            const int32_t invalid_slot_val,
                                            const JoinColumn join_column,
                                            const JoinColumnTypeInfo type_info,
                                            int* err) {
  int partial_err = SUFFIX(fill_hash_join_buff)(
      buff, invalid_slot_val, join_column, type_info, NULL, NULL, -1, -1);
  atomicCAS(err, 0, partial_err);
}

__global__ void fill_hash_join_buff_bucketized_wrapper(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
                                                           invalid_slot_val,
                                                           join_column,
                                                           type_info,
                                                           NULL,
                                                           NULL,
                                                           -1,
                                                           -1,
                                                           bucket_normalization);
  atomicCAS(err, 0, partial_err);
}

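// Host entry points for filling a one-to-one (perfect) hash join buffer on the
// device. The block_size_x/grid_size_x arguments are kept for interface
// compatibility with the CPU path but are ignored here; the launch configuration is
// chosen by cuda_kernel_launch_wrapper.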
void fill_hash_join_buff_on_device_bucketized(int32_t* buff,
                                              const int32_t invalid_slot_val,
                                              int* dev_err_buff,
                                              const JoinColumn join_column,
                                              const JoinColumnTypeInfo type_info,
                                              const size_t block_size_x,
                                              const size_t grid_size_x,
                                              const int64_t bucket_normalization) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             dev_err_buff,
                             bucket_normalization);
}

void fill_hash_join_buff_on_device(int32_t* buff,
                                   const int32_t invalid_slot_val,
                                   int* dev_err_buff,
                                   const JoinColumn join_column,
                                   const JoinColumnTypeInfo type_info,
                                   const size_t block_size_x,
                                   const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             dev_err_buff);
}

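// Sharded variants: with sharded tables each device owns a subset of the shards, so
// the fill routines receive a ShardInfo describing the shard layout and only insert
// rows that belong to this device's shards.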
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
                                                                   invalid_slot_val,
                                                                   join_column,
                                                                   type_info,
                                                                   shard_info,
                                                                   NULL,
                                                                   NULL,
                                                                   -1,
                                                                   -1,
                                                                   bucket_normalization);
  atomicCAS(err, 0, partial_err);
}

__global__ void fill_hash_join_buff_wrapper_sharded(int32_t* buff,
                                                    const int32_t invalid_slot_val,
                                                    const JoinColumn join_column,
                                                    const JoinColumnTypeInfo type_info,
                                                    const ShardInfo shard_info,
                                                    int* err) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(
      buff, invalid_slot_val, join_column, type_info, shard_info, NULL, NULL, -1, -1);
  atomicCAS(err, 0, partial_err);
}

void fill_hash_join_buff_on_device_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    int* dev_err_buff,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    const size_t block_size_x,
    const size_t grid_size_x,
    const int64_t bucket_normalization) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff,
                             bucket_normalization);
}

void fill_hash_join_buff_on_device_sharded(int32_t* buff,
                                           const int32_t invalid_slot_val,
                                           int* dev_err_buff,
                                           const JoinColumn join_column,
                                           const JoinColumnTypeInfo type_info,
                                           const ShardInfo shard_info,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff);
}

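// Initializes every slot of a one-to-one hash join buffer to invalid_slot_val.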
__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int64_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}

void init_hash_join_buff_on_device(int32_t* buff,
                                   const int64_t hash_entry_count,
                                   const int32_t invalid_slot_val,
                                   const size_t block_size_x,
                                   const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
}

#define VALID_POS_FLAG 0

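// The one-to-many fills below operate on a buffer laid out as
// [pos_buff | count_buff | row id payload]. set_valid_pos_flag marks entries that
// received at least one match; after an inclusive prefix sum over count_buff,
// set_valid_pos turns each marked entry's position into the exclusive offset of its
// row id list within the payload section.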
__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (count_buff[i]) {
      pos_buff[i] = VALID_POS_FLAG;
    }
  }
}

__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (VALID_POS_FLAG == pos_buff[i]) {
      pos_buff[i] = !i ? 0 : count_buff[i - 1];
    }
  }
}

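// Shared driver for the one-to-many fills: zero the per-entry counts, run the
// caller-supplied count kernel, derive offsets via a thrust prefix sum, then zero
// the counts again so the caller-supplied fill kernel can reuse them as write
// cursors while scattering row ids into the payload section.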
template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
void fill_one_to_many_hash_table_on_device_impl(int32_t* buff,
                                                const int64_t hash_entry_count,
                                                const int32_t invalid_slot_val,
                                                const JoinColumn& join_column,
                                                const JoinColumnTypeInfo& type_info,
                                                const size_t block_size_x,
                                                const size_t grid_size_x,
                                                COUNT_MATCHES_FUNCTOR count_matches_func,
                                                FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  count_matches_func();

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);

  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  fill_row_ids_func();
}

void fill_one_to_many_hash_table_on_device(int32_t* buff,
                                           const HashEntryInfo hash_entry_info,
                                           const int32_t invalid_slot_val,
                                           const JoinColumn& join_column,
                                           const JoinColumnTypeInfo& type_info,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  auto count_matches_func = [hash_entry_count,
                             grid_size_x,
                             block_size_x,
                             count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info] {
    cuda_kernel_launch_wrapper(
        SUFFIX(count_matches), count_buff, invalid_slot_val, join_column, type_info);
  };

  auto fill_row_ids_func = [grid_size_x,
                            block_size_x,
                            buff,
                            hash_entry_count,
                            invalid_slot_val,
                            join_column,
                            type_info] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
                               buff,
                               hash_entry_count,
                               invalid_slot_val,
                               join_column,
                               type_info);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             block_size_x,
                                             grid_size_x,
                                             count_matches_func,
                                             fill_row_ids_func);
}

void fill_one_to_many_hash_table_on_device_bucketized(int32_t* buff,
                                                      const HashEntryInfo hash_entry_info,
                                                      const int32_t invalid_slot_val,
                                                      const JoinColumn& join_column,
                                                      const JoinColumnTypeInfo& type_info,
                                                      const size_t block_size_x,
                                                      const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
  auto count_matches_func = [grid_size_x,
                             block_size_x,
                             count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             bucket_normalization = hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
                               count_buff,
                               invalid_slot_val,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  auto fill_row_ids_func = [grid_size_x,
                            block_size_x,
                            buff,
                            hash_entry_count = hash_entry_info.getNormalizedHashEntryCount(),
                            invalid_slot_val,
                            join_column,
                            type_info,
                            bucket_normalization = hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
                               buff,
                               hash_entry_count,
                               invalid_slot_val,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             block_size_x,
                                             grid_size_x,
                                             count_matches_func,
                                             fill_row_ids_func);
}

void fill_one_to_many_hash_table_on_device_sharded(int32_t* buff,
                                                   const HashEntryInfo hash_entry_info,
                                                   const int32_t invalid_slot_val,
                                                   const JoinColumn& join_column,
                                                   const JoinColumnTypeInfo& type_info,
                                                   const ShardInfo& shard_info,
                                                   const size_t block_size_x,
                                                   const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
                             count_buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
                             buff,
                             hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info);
}

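// Baseline (composite key) hash tables store multi-column keys in a separate key
// dictionary. The one-to-many fill follows the same count/scan/fill scheme as above,
// but resolves keys through composite_key_dict, whose component width is T
// (32- or 64-bit).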
template <typename T, typename KEY_HANDLER>
void fill_one_to_many_baseline_hash_table_on_device(int32_t* buff,
                                                    const T* composite_key_dict,
                                                    const int64_t hash_entry_count,
                                                    const int32_t invalid_slot_val,
                                                    const KEY_HANDLER* key_handler,
                                                    const size_t num_elems,
                                                    const size_t block_size_x,
                                                    const size_t grid_size_x) {
  auto pos_buff = buff;
  auto count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
                             count_buff,
                             composite_key_dict,
                             hash_entry_count,
                             key_handler,
                             num_elems);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));

  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
                             buff,
                             composite_key_dict,
                             hash_entry_count,
                             invalid_slot_val,
                             key_handler,
                             num_elems);
}

template <typename T>
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                      const int64_t entry_count,
                                                      const size_t key_component_count,
                                                      const bool with_val_slot,
                                                      const int32_t invalid_slot_val) {
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}

void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}

void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}

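// One-to-one fill for baseline (composite key) hash tables. The KEY_HANDLER
// (GenericKeyHandler or OverlapsKeyHandler) produces the key components for each
// input row; errors are again reduced to the first non-zero code via atomicCAS.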
template <typename T, typename KEY_HANDLER>
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
                                                      const int64_t entry_count,
                                                      const int32_t invalid_slot_val,
                                                      const size_t key_component_count,
                                                      const bool with_val_slot,
                                                      int* err,
                                                      const KEY_HANDLER* key_handler,
                                                      const int64_t num_elems) {
  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                            entry_count,
                                                            invalid_slot_val,
                                                            key_component_count,
                                                            with_val_slot,
                                                            key_handler,
                                                            num_elems,
                                                            -1,
                                                            -1);
  atomicCAS(err, 0, partial_err);
}

void fill_baseline_hash_join_buff_on_device_32(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void overlaps_fill_baseline_hash_join_buff_on_device_64(
    int8_t* hash_buff,
    const int64_t entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const bool with_val_slot,
    int* dev_err_buff,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void fill_one_to_many_baseline_hash_table_on_device_32(
    int32_t* buff,
    const int32_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}

void fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}

void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}

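// Estimates the number of distinct join keys with HyperLogLog: the kernel updates
// one byte-wide HLL register per bucket (2^b registers in hll_buffer). The overlaps
// variant also fills row_counts_buffer with per-row counts and converts them into a
// running total with a prefix sum for later use when building the overlaps table.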
void approximate_distinct_tuples_on_device_overlaps(uint8_t* hll_buffer,
                                                    const uint32_t b,
                                                    int32_t* row_counts_buffer,
                                                    const OverlapsKeyHandler* key_handler,
                                                    const int64_t num_elems,
                                                    const size_t block_size_x,
                                                    const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
                             hll_buffer,
                             row_counts_buffer,
                             b,
                             num_elems,
                             key_handler);

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}

void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
                                           const uint32_t b,
                                           const GenericKeyHandler* key_handler,
                                           const int64_t num_elems,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
                             hll_buffer,
                             nullptr,
                             b,
                             num_elems,
                             key_handler);
}

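// Computes per-dimension bucket sizes for the overlaps hash join (two dimensions,
// hence compute_bucket_sizes_impl_gpu<2>) given the bucket size threshold.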
void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
                                    const JoinColumn* join_column,
                                    const JoinColumnTypeInfo* type_info,
                                    const double bucket_sz_threshold,
                                    const size_t block_size_x,
                                    const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
                             bucket_sizes_buffer,
                             join_column,
                             type_info,
                             bucket_sz_threshold,
                             block_size_x,
                             grid_size_x);
}