OmniSciDB  c1a53651b2
HashJoinRuntime.h File Reference
#include <cstddef>
#include <cstdint>
#include <vector>
#include "../../../Shared/SqlTypesLayout.h"
#include "../../../Shared/sqltypes.h"
#include "../../RuntimeFunctions.h"
#include "../../../Shared/funcannotations.h"
+ Include dependency graph for HashJoinRuntime.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  BucketizedHashEntryInfo
 
struct  JoinChunk
 
struct  JoinColumn
 
struct  JoinColumnTypeInfo
 
struct  JoinBucketInfo
 
struct  ShardInfo
 

Enumerations

enum  ColumnType { SmallDate = 0, Signed = 1, Unsigned = 2, Double = 3 }
 

Functions

void init_hash_join_buff (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
ColumnType get_join_column_type_kind (const SQLTypeInfo &ti)
 
int fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
int fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
 
void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
 
void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
 
void fill_one_to_many_hash_table (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, const bool for_window_framing)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_sharded_bucketized (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_on_device (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const bool for_window_framing)
 
void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false, const bool for_window_framing=false)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false, const bool for_window_framing=false)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
 
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
 

Variables

const size_t g_maximum_conditions_to_coalesce {8}
 

Enumeration Type Documentation

enum ColumnType
Enumerator
SmallDate 
Signed 
Unsigned 
Double 

Definition at line 120 of file HashJoinRuntime.h.
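
For orientation, a minimal sketch (not part of the header) of how a caller might branch on the enumerator returned by get_join_column_type_kind(), documented further below; the helper name and the include path are assumptions:

#include "QueryEngine/JoinHashTable/Runtime/HashJoinRuntime.h"  // assumed repository-relative path

// Hypothetical helper: describe how a join key column will be treated,
// based on the documented ColumnType enumerators.
inline const char* describe_join_column_kind(const SQLTypeInfo& ti) {
  switch (get_join_column_type_kind(ti)) {
    case SmallDate:
      return "date encoded in days; bucketized before hashing";
    case Unsigned:
      return "unsigned integer key";
    case Double:
      return "floating-point key";
    case Signed:
    default:
      return "signed integer key";
  }
}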

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2196 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2201  {
2202  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2203  CHECK(!join_column_per_key.empty());
2204 
2205  std::vector<std::future<void>> approx_distinct_threads;
2206  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2207  approx_distinct_threads.push_back(std::async(
2208  std::launch::async,
2209  [&join_column_per_key,
2210  &type_info_per_key,
2211  b,
2212  hll_buffer_all_cpus,
2213  padded_size_bytes,
2214  thread_idx,
2215  thread_count] {
2216  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2217 
2218  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2219  false,
2220  &join_column_per_key[0],
2221  &type_info_per_key[0],
2222  nullptr,
2223  nullptr);
2224  approximate_distinct_tuples_impl(hll_buffer,
2225  nullptr,
2226  b,
2227  join_column_per_key[0].num_elems,
2228  &key_handler,
2229  thread_idx,
2230  thread_count);
2231  }));
2232  }
2233  for (auto& child : approx_distinct_threads) {
2234  child.get();
2235  }
2236 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
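
A minimal usage sketch (assumptions: one 2^b-byte HLL register block per worker thread, and join_columns / type_infos prepared by the caller as elsewhere on this page):

#include <cstdint>
#include <vector>

void estimate_distinct_tuples_cpu(const std::vector<JoinColumn>& join_columns,
                                  const std::vector<JoinColumnTypeInfo>& type_infos,
                                  const int thread_count) {
  const uint32_t b = 11;                             // log2 of the number of HLL registers
  const size_t padded_size_bytes = size_t(1) << b;   // one byte per register, per thread
  std::vector<uint8_t> hll_buffer_all_cpus(thread_count * padded_size_bytes, 0);

  approximate_distinct_tuples(hll_buffer_all_cpus.data(),
                              b,
                              padded_size_bytes,
                              join_columns,
                              type_infos,
                              thread_count);
  // Each thread has now populated its own register block; the caller would unify the
  // blocks and run an HLL cardinality estimate over the result.
}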

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 625 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

628  {
629  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
630  hll_buffer,
631  nullptr,
632  b,
633  num_elems,
634  key_handler);
635 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_overlaps ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 591 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

595  {
596  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
597  hll_buffer,
598  row_counts_buffer,
599  b,
600  num_elems,
601  key_handler);
602 
603  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
604  thrust::inclusive_scan(
605  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
606 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_on_device_range ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 608 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

614  {
615  auto qe_cuda_stream = getQueryEngineCudaStream();
616  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
617  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
618  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
619 
620  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
621  thrust::inclusive_scan(
622  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
623 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2238 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2246  {
2247  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2248  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2249  CHECK(!join_column_per_key.empty());
2250 
2251  std::vector<std::future<void>> approx_distinct_threads;
2252  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2253  approx_distinct_threads.push_back(std::async(
2254  std::launch::async,
2255  [&join_column_per_key,
2256  &join_buckets_per_key,
2257  &row_counts,
2258  b,
2259  hll_buffer_all_cpus,
2260  padded_size_bytes,
2261  thread_idx,
2262  thread_count] {
2263  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2264 
2265  const auto key_handler = OverlapsKeyHandler(
2266  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2267  &join_column_per_key[0],
2268  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2269  approximate_distinct_tuples_impl(hll_buffer,
2270  row_counts.data(),
2271  b,
2272  join_column_per_key[0].num_elems,
2273  &key_handler,
2274  thread_idx,
2275  thread_count);
2276  }));
2277  }
2278  for (auto& child : approx_distinct_threads) {
2279  child.get();
2280  }
2281 
2282  inclusive_scan(
2283  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2284 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2286 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2295  {
2296  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2297  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2298  CHECK(!join_column_per_key.empty());
2299 
2300  std::vector<std::future<void>> approx_distinct_threads;
2301  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2302  approx_distinct_threads.push_back(std::async(
2303  std::launch::async,
2304  [&join_column_per_key,
2305  &join_buckets_per_key,
2306  &row_counts,
2307  b,
2308  hll_buffer_all_cpus,
2309  padded_size_bytes,
2310  thread_idx,
2311  is_compressed,
2312  thread_count] {
2313  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2314 
2315  const auto key_handler = RangeKeyHandler(
2316  is_compressed,
2317  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2318  &join_column_per_key[0],
2319  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2320  approximate_distinct_tuples_impl(hll_buffer,
2321  row_counts.data(),
2322  b,
2323  join_column_per_key[0].num_elems,
2324  &key_handler,
2325  thread_idx,
2326  thread_count);
2327  }));
2328  }
2329  for (auto& child : approx_distinct_threads) {
2330  child.get();
2331  }
2332 
2333  inclusive_scan(
2334  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2335 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2337 of file HashJoinRuntime.cpp.

References threading_serial::async().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2341  {
2342  std::vector<std::vector<double>> bucket_sizes_for_threads;
2343  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2344  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2345  }
2346  std::vector<std::future<void>> threads;
2347  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2348  threads.push_back(std::async(std::launch::async,
2349  compute_bucket_sizes_impl<2>,
2350  bucket_sizes_for_threads[thread_idx].data(),
2351  &join_column,
2352  &type_info,
2353  bucket_size_thresholds.data(),
2354  thread_idx,
2355  thread_count));
2356  }
2357  for (auto& child : threads) {
2358  child.get();
2359  }
2360 
2361  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2362  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2363  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2364  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2365  }
2366  }
2367  }
2368 }
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
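
A sketch of the calling convention, under the assumption (consistent with the loop above) that the output vector holds one slot per dimension of the join key and keeps the per-thread maximum:

#include <limits>
#include <vector>

void pick_overlaps_bucket_sizes(const JoinColumn& join_column,
                                const JoinColumnTypeInfo& type_info,
                                const int thread_count) {
  std::vector<double> bucket_sizes_for_dimension(2, 0.0);  // e.g. x and y for 2-D geometry
  const std::vector<double> bucket_size_thresholds(2, std::numeric_limits<double>::max());

  compute_bucket_sizes_on_cpu(
      bucket_sizes_for_dimension, join_column, type_info, bucket_size_thresholds, thread_count);
  // bucket_sizes_for_dimension[i] now holds the largest size any worker thread computed
  // for dimension i, subject to bucket_size_thresholds[i].
}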

void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds 
)

Definition at line 637 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

640  {
641  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
642  bucket_sizes_buffer,
643  join_column,
644  type_info,
645  bucket_sz_threshold);
646 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1795 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1804  {
1805  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1806  entry_count,
1807  invalid_slot_val,
1808  for_semi_join,
1809  key_component_count,
1810  with_val_slot,
1811  key_handler,
1812  num_elems,
1813  cpu_thread_idx,
1814  cpu_thread_count);
1815 }

+ Here is the caller graph for this function:

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1859 of file HashJoinRuntime.cpp.

1868  {
1869  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1870  entry_count,
1871  invalid_slot_val,
1872  for_semi_join,
1873  key_component_count,
1874  with_val_slot,
1875  key_handler,
1876  num_elems,
1877  cpu_thread_idx,
1878  cpu_thread_count);
1879 }
void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 453 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

461  {
462  cuda_kernel_launch_wrapper(
463  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
464  hash_buff,
465  entry_count,
466  invalid_slot_val,
467  for_semi_join,
468  key_component_count,
469  with_val_slot,
470  dev_err_buff,
471  key_handler,
472  num_elems);
473 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 475 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

483  {
484  cuda_kernel_launch_wrapper(
485  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
486  hash_buff,
487  entry_count,
488  invalid_slot_val,
489  for_semi_join,
490  key_component_count,
491  with_val_slot,
492  dev_err_buff,
493  key_handler,
494  num_elems);
495 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

int fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 199 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

207  {
208  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
209  : SUFFIX(fill_one_to_one_hashtable);
210  auto hashtable_filling_func = [&](auto elem, size_t index) {
211  auto entry_ptr = SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
212  return filling_func(index, entry_ptr, invalid_slot_val);
213  };
214 
215  return fill_hash_join_buff_impl(buff,
216  join_column,
217  type_info,
218  sd_inner_to_outer_translation_map,
219  min_inner_elem,
220  cpu_thread_idx,
221  cpu_thread_count,
222  hashtable_filling_func);
223 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
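
A threading sketch (not from the code base) showing how the (cpu_thread_idx, cpu_thread_count) pair is typically used: every worker writes the same buffer, and a non-zero return value signals that the one-to-one fill could not complete (in practice, a duplicate join key):

#include <future>
#include <vector>

int fill_one_to_one_on_all_threads(int32_t* buff,
                                   const int32_t invalid_slot_val,
                                   const JoinColumn& join_column,
                                   const JoinColumnTypeInfo& type_info,
                                   const int thread_count) {
  std::vector<std::future<int>> workers;
  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
    workers.push_back(std::async(std::launch::async, [&, thread_idx] {
      return fill_hash_join_buff(buff,
                                 invalid_slot_val,
                                 /*for_semi_join=*/false,
                                 join_column,
                                 type_info,
                                 /*sd_inner_to_outer_translation_map=*/nullptr,
                                 /*min_inner_elem=*/0,
                                 thread_idx,
                                 thread_count);
    }));
  }
  int err = 0;
  for (auto& worker : workers) {
    err |= worker.get();  // any non-zero result aborts the one-to-one layout upstream
  }
  return err;
}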

int fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 166 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

176  {
177  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
178  : SUFFIX(fill_one_to_one_hashtable);
179  auto hashtable_filling_func = [&](auto elem, size_t index) {
180  auto entry_ptr =
181  SUFFIX(get_bucketized_hash_slot)(buff,
182  elem,
183  type_info.min_val / bucket_normalization,
184  type_info.translated_null_val,
185  bucket_normalization);
186  return filling_func(index, entry_ptr, invalid_slot_val);
187  };
188 
189  return fill_hash_join_buff_impl(buff,
190  join_column,
191  type_info,
192  sd_inner_to_outer_translation_map,
193  min_inner_elem,
194  cpu_thread_idx,
195  cpu_thread_count,
196  hashtable_filling_func);
197 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
const int64_t translated_null_val
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_hash_join_buff_on_device ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info 
)

Definition at line 85 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

90  {
91  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
92  buff,
93  invalid_slot_val,
94  for_semi_join,
95  join_column,
96  type_info,
97  dev_err_buff);
98 }
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int64_t  bucket_normalization 
)

Definition at line 68 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

74  {
75  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
76  buff,
77  invalid_slot_val,
78  for_semi_join,
79  join_column,
80  type_info,
81  dev_err_buff,
82  bucket_normalization);
83 }
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info 
)

Definition at line 163 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

169  {
170  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
171  buff,
172  invalid_slot_val,
173  for_semi_join,
174  join_column,
175  type_info,
176  shard_info,
177  dev_err_buff);
178 }
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_hash_join_buff_on_device_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int64_t  bucket_normalization 
)

Definition at line 143 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

151  {
152  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
153  buff,
154  invalid_slot_val,
155  for_semi_join,
156  join_column,
157  type_info,
158  shard_info,
159  dev_err_buff,
160  bucket_normalization);
161 }
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false,
const bool  for_window_framing = false 
)

Definition at line 2138 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2151  {
2152  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2153  composite_key_dict,
2154  hash_entry_count,
2155  key_component_count,
2156  join_column_per_key,
2157  type_info_per_key,
2158  join_bucket_info,
2159  sd_inner_to_outer_translation_maps,
2160  sd_min_inner_elems,
2161  cpu_thread_count,
2162  is_range_join,
2163  is_geo_compressed,
2164  for_window_framing);
2165 }

+ Here is the caller graph for this function:
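
A hedged dispatch sketch: the _32/_64 suffix corresponds to the width of each component in composite_key_dict, so a builder is assumed to select the variant from its key component width. Only the two documented calls are real; the wrapper and its key_component_width parameter are illustrative:

#include <cstdint>
#include <vector>

void fill_one_to_many_baseline(const size_t key_component_width,  // 4 or 8 bytes (assumption)
                               int32_t* buff,
                               const int8_t* composite_key_dict,
                               const int64_t hash_entry_count,
                               const size_t key_component_count,
                               const std::vector<JoinColumn>& join_columns,
                               const std::vector<JoinColumnTypeInfo>& type_infos,
                               const std::vector<JoinBucketInfo>& join_bucket_info,
                               const std::vector<const int32_t*>& sd_translation_maps,
                               const std::vector<int32_t>& sd_min_inner_elems,
                               const int32_t cpu_thread_count) {
  if (key_component_width == 4) {
    fill_one_to_many_baseline_hash_table_32(buff,
                                            reinterpret_cast<const int32_t*>(composite_key_dict),
                                            hash_entry_count,
                                            key_component_count,
                                            join_columns,
                                            type_infos,
                                            join_bucket_info,
                                            sd_translation_maps,
                                            sd_min_inner_elems,
                                            cpu_thread_count);
  } else {
    fill_one_to_many_baseline_hash_table_64(buff,
                                            reinterpret_cast<const int64_t*>(composite_key_dict),
                                            hash_entry_count,
                                            key_component_count,
                                            join_columns,
                                            type_infos,
                                            join_bucket_info,
                                            sd_translation_maps,
                                            sd_min_inner_elems,
                                            cpu_thread_count);
  }
}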

void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false,
const bool  for_window_framing = false 
)

Definition at line 2167 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2180  {
2181  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2182  composite_key_dict,
2183  hash_entry_count,
2184  key_component_count,
2185  join_column_per_key,
2186  type_info_per_key,
2187  join_bucket_info,
2188  sd_inner_to_outer_translation_maps,
2189  sd_min_inner_elems,
2190  cpu_thread_count,
2191  is_range_join,
2192  is_geo_compressed,
2193  for_window_framing);
2194 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const bool  for_window_framing 
)

Definition at line 540 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

547  {
548  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
549  composite_key_dict,
550  hash_entry_count,
551  key_handler,
552  num_elems,
553  for_window_framing);
554 }

+ Here is the caller graph for this function:

void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const bool  for_window_framing 
)

Definition at line 556 of file HashJoinRuntimeGpu.cu.

562  {
563  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
564  composite_key_dict,
565  hash_entry_count,
566  key_handler,
567  num_elems,
568  for_window_framing);
569 }
void fill_one_to_many_hash_table ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count,
const bool  for_window_framing 
)

Definition at line 1490 of file HashJoinRuntime.cpp.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1497  {
1498  auto timer = DEBUG_TIMER(__func__);
1499  auto launch_count_matches =
1500  [count_buff = buff + hash_entry_info.bucketized_hash_entry_count,
1501  &join_column,
1502  &type_info,
1503  sd_inner_to_outer_translation_map,
1504  min_inner_elem](auto cpu_thread_idx, auto cpu_thread_count) {
1505  SUFFIX(count_matches)
1506  (count_buff,
1507  join_column,
1508  type_info,
1509  sd_inner_to_outer_translation_map,
1510  min_inner_elem,
1511  cpu_thread_idx,
1512  cpu_thread_count);
1513  };
1514  auto launch_fill_row_ids =
1515  [hash_entry_count = hash_entry_info.bucketized_hash_entry_count,
1516  buff,
1517  &join_column,
1518  &type_info,
1519  sd_inner_to_outer_translation_map,
1520  min_inner_elem,
1521  for_window_framing](auto cpu_thread_idx, auto cpu_thread_count) {
1522  SUFFIX(fill_row_ids)
1523  (buff,
1524  hash_entry_count,
1525  join_column,
1526  type_info,
1527  for_window_framing,
1528  sd_inner_to_outer_translation_map,
1529  min_inner_elem,
1530  cpu_thread_idx,
1531  cpu_thread_count);
1532  };
1533 
1534  fill_one_to_many_hash_table_impl(buff,
1535  hash_entry_info.bucketized_hash_entry_count,
1536  join_column,
1537  type_info,
1538  sd_inner_to_outer_translation_map,
1539  min_inner_elem,
1540  cpu_thread_count,
1541  for_window_framing,
1542  launch_count_matches,
1543  launch_fill_row_ids);
1544 }
#define SUFFIX(name)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define DEBUG_TIMER(name)
Definition: Logger.h:411
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
size_t bucketized_hash_entry_count

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
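
The offset visible above (count_buff = buff + hash_entry_info.bucketized_hash_entry_count) suggests the buffer layout assumed here; a sketch of the sizing arithmetic, with the payload term an assumption rather than something stated on this page:

#include <cstddef>

// Assumed layout of the one-to-many buffer filled above:
//   [ pos section | count section | row-id payload ]
// where pos and count each hold hash_entry_count int32_t slots.
inline size_t one_to_many_slot_count(const size_t hash_entry_count, const size_t num_row_ids) {
  return hash_entry_count    // pos: offset of the first matching row id per hash entry
         + hash_entry_count  // count: number of matching row ids per hash entry
         + num_row_ids;      // payload: the row ids themselves (assumption: one per input row)
}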

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1546 of file HashJoinRuntime.cpp.

References BucketizedHashEntryInfo::bucket_normalization, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), BucketizedHashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1553  {
1554  auto timer = DEBUG_TIMER(__func__);
1555  auto bucket_normalization = hash_entry_info.bucket_normalization;
1556  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1557  auto launch_count_matches = [bucket_normalization,
1558  count_buff = buff + hash_entry_count,
1559  &join_column,
1560  &type_info,
1561  sd_inner_to_outer_translation_map,
1562  min_inner_elem](auto cpu_thread_idx,
1563  auto cpu_thread_count) {
1564  SUFFIX(count_matches_bucketized)
1565  (count_buff,
1566  join_column,
1567  type_info,
1568  sd_inner_to_outer_translation_map,
1569  min_inner_elem,
1570  cpu_thread_idx,
1571  cpu_thread_count,
1572  bucket_normalization);
1573  };
1574  auto launch_fill_row_ids = [bucket_normalization,
1575  hash_entry_count,
1576  buff,
1577  &join_column,
1578  &type_info,
1579  sd_inner_to_outer_translation_map,
1580  min_inner_elem](auto cpu_thread_idx,
1581  auto cpu_thread_count) {
1582  SUFFIX(fill_row_ids_bucketized)
1583  (buff,
1584  hash_entry_count,
1585  join_column,
1586  type_info,
1587  sd_inner_to_outer_translation_map,
1588  min_inner_elem,
1589  cpu_thread_idx,
1590  cpu_thread_count,
1591  bucket_normalization);
1592  };
1593 
1594  fill_one_to_many_hash_table_impl(buff,
1595  hash_entry_count,
1596  join_column,
1597  type_info,
1598  sd_inner_to_outer_translation_map,
1599  min_inner_elem,
1600  cpu_thread_count,
1601  false,
1602  launch_count_matches,
1603  launch_fill_row_ids);
1604 }
#define SUFFIX(name)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
#define DEBUG_TIMER(name)
Definition: Logger.h:411
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void fill_one_to_many_hash_table_on_device ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const bool  for_window_framing 
)

Definition at line 247 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), and SUFFIX.

251  {
252  auto hash_entry_count = hash_entry_info.bucketized_hash_entry_count;
253  auto count_matches_func = [count_buff = buff + hash_entry_count,
254  join_column,
255  type_info] {
256  cuda_kernel_launch_wrapper(SUFFIX(count_matches), count_buff, join_column, type_info);
257  };
258 
259  auto fill_row_ids_func =
260  [buff, hash_entry_count, join_column, type_info, for_window_framing] {
261  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
262  buff,
263  hash_entry_count,
264  join_column,
265  type_info,
266  for_window_framing);
267  };
268 
269  fill_one_to_many_hash_table_on_device_impl(buff,
270  hash_entry_count,
271  join_column,
272  type_info,
273  count_matches_func,
274  fill_row_ids_func);
275 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
size_t bucketized_hash_entry_count

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_bucketized ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info 
)

Definition at line 277 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucket_normalization, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), BucketizedHashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

281  {
282  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
283  auto count_matches_func = [count_buff = buff + hash_entry_count,
284  join_column,
285  type_info,
286  bucket_normalization =
287  hash_entry_info.bucket_normalization] {
288  cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
289  count_buff,
290  join_column,
291  type_info,
292  bucket_normalization);
293  };
294 
295  auto fill_row_ids_func = [buff,
296  hash_entry_count =
297  hash_entry_info.getNormalizedHashEntryCount(),
298  join_column,
299  type_info,
300  bucket_normalization = hash_entry_info.bucket_normalization] {
301  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
302  buff,
303  hash_entry_count,
304  join_column,
305  type_info,
306  bucket_normalization);
307  };
308 
309  fill_one_to_many_hash_table_on_device_impl(buff,
310  hash_entry_count,
311  join_column,
312  type_info,
313  count_matches_func,
314  fill_row_ids_func);
315 }
#define SUFFIX(name)
int64_t bucket_normalization
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_on_device_sharded ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info 
)

Definition at line 317 of file HashJoinRuntimeGpu.cu.

References BucketizedHashEntryInfo::bucketized_hash_entry_count, checkCudaErrors, count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), getQueryEngineCudaStream(), inclusive_scan(), set_valid_pos(), set_valid_pos_flag(), and SUFFIX.

322  {
323  auto hash_entry_count = hash_entry_info.bucketized_hash_entry_count;
324  int32_t* pos_buff = buff;
325  int32_t* count_buff = buff + hash_entry_count;
326  auto qe_cuda_stream = getQueryEngineCudaStream();
327  checkCudaErrors(
328  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
329  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
330  cuda_kernel_launch_wrapper(
331  SUFFIX(count_matches_sharded), count_buff, join_column, type_info, shard_info);
332 
333  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
334 
335  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
336  thrust::inclusive_scan(
337  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
338  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
339  checkCudaErrors(
340  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
341  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
342  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
343  buff,
344  hash_entry_count,
345  join_column,
346  type_info,
347  shard_info);
348 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
size_t bucketized_hash_entry_count
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)

+ Here is the call graph for this function:

void fill_one_to_many_hash_table_sharded_bucketized ( int32_t *  buff,
const BucketizedHashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

ColumnType get_join_column_type_kind ( const SQLTypeInfo &  ti)
inline

Definition at line 147 of file HashJoinRuntime.h.

References SQLTypeInfo::is_date_in_days(), is_unsigned_type(), Signed, SmallDate, and Unsigned.

Referenced by OverlapsJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTable::fetchColumnsForDevice(), BaselineJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

147  {
148  if (ti.is_date_in_days()) {
149  return SmallDate;
150  } else {
151  return is_unsigned_type(ti) ? Unsigned : Signed;
152  }
153 }
bool is_date_in_days() const
Definition: sqltypes.h:988
bool is_unsigned_type(const SQLTypeInfo &ti)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1739 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1745  {
1746  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1747  entry_count,
1748  key_component_count,
1749  with_val_slot,
1750  invalid_slot_val,
1751  cpu_thread_idx,
1752  cpu_thread_count);
1753 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1755 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1761  {
1762  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1763  entry_count,
1764  key_component_count,
1765  with_val_slot,
1766  invalid_slot_val,
1767  cpu_thread_idx,
1768  cpu_thread_count);
1769 }

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 404 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

408  {
409  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
410  hash_join_buff,
411  entry_count,
412  key_component_count,
413  with_val_slot,
414  invalid_slot_val);
415 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 417 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

421  {
422  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
423  hash_join_buff,
424  entry_count,
425  key_component_count,
426  with_val_slot,
427  invalid_slot_val);
428 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void init_hash_join_buff ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 70 of file HashJoinRuntime.cpp.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

74  {
75 #ifdef __CUDACC__
76  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
77  int32_t step = blockDim.x * gridDim.x;
78 #else
79  int32_t start = cpu_thread_idx;
80  int32_t step = cpu_thread_count;
81 #endif
82  for (int64_t i = start; i < hash_entry_count; i += step) {
83  groups_buffer[i] = invalid_slot_val;
84  }
85 }

+ Here is the caller graph for this function:
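
A caller-side sketch (mirroring the CPU builders listed under "Referenced by"): each worker strides the same buffer by cpu_thread_count, so every slot is set to invalid_slot_val exactly once:

#include <algorithm>
#include <thread>
#include <vector>

void init_hash_join_buff_on_all_threads(int32_t* buff,
                                        const int64_t entry_count,
                                        const int32_t invalid_slot_val) {
  const int32_t thread_count = std::max(1u, std::thread::hardware_concurrency());
  std::vector<std::thread> workers;
  for (int32_t thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
    workers.emplace_back([=] {
      init_hash_join_buff(buff, entry_count, invalid_slot_val, thread_idx, thread_count);
    });
  }
  for (auto& worker : workers) {
    worker.join();
  }
}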

void init_hash_join_buff_on_device ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 186 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

188  {
189  cuda_kernel_launch_wrapper(
190  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
191 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1817 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1825  {
1826  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1827  entry_count,
1828  invalid_slot_val,
1829  false,
1830  key_component_count,
1831  with_val_slot,
1832  key_handler,
1833  num_elems,
1834  cpu_thread_idx,
1835  cpu_thread_count);
1836 }

+ Here is the caller graph for this function:

int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1881 of file HashJoinRuntime.cpp.

1889  {
1890  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1891  entry_count,
1892  invalid_slot_val,
1893  false,
1894  key_component_count,
1895  with_val_slot,
1896  key_handler,
1897  num_elems,
1898  cpu_thread_idx,
1899  cpu_thread_count);
1900 }
void overlaps_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 497 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

505  {
506  cuda_kernel_launch_wrapper(
507  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
508  hash_buff,
509  entry_count,
510  invalid_slot_val,
511  false,
512  key_component_count,
513  with_val_slot,
514  dev_err_buff,
515  key_handler,
516  num_elems);
517 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 571 of file HashJoinRuntimeGpu.cu.

576  {
577  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
578  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
579 }
int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1838 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1846  {
1847  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1848  entry_count,
1849  invalid_slot_val,
1850  false,
1851  key_component_count,
1852  with_val_slot,
1853  key_handler,
1854  num_elems,
1855  cpu_thread_idx,
1856  cpu_thread_count);
1857 }

+ Here is the caller graph for this function:

int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1902 of file HashJoinRuntime.cpp.

1910  {
1911  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1912  entry_count,
1913  invalid_slot_val,
1914  false,
1915  key_component_count,
1916  with_val_slot,
1917  key_handler,
1918  num_elems,
1919  cpu_thread_idx,
1920  cpu_thread_count);
1921 }
void range_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 519 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

526  {
527  cuda_kernel_launch_wrapper(
528  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
529  hash_buff,
530  entry_count,
531  invalid_slot_val,
532  false,
533  key_component_count,
534  with_val_slot,
535  dev_err_buff,
536  key_handler,
537  num_elems);
538 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)

+ Here is the call graph for this function:

void range_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const size_t  hash_entry_count,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 581 of file HashJoinRuntimeGpu.cu.

586  {
587  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
588  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
589 }

Variable Documentation