OmniSciDB  085a039ca4
HashJoinRuntime.h File Reference
#include <cstddef>
#include <cstdint>
#include <vector>
#include "../../../Shared/SqlTypesLayout.h"
#include "../../../Shared/sqltypes.h"
#include "../../RuntimeFunctions.h"
#include "../../../Shared/funcannotations.h"


Classes

struct  HashEntryInfo
 
struct  JoinChunk
 
struct  JoinColumn
 
struct  JoinColumnTypeInfo
 
struct  JoinBucketInfo
 
struct  ShardInfo
 

Enumerations

enum  ColumnType { SmallDate = 0, Signed = 1, Unsigned = 2, Double = 3 }
 

Functions

void init_hash_join_buff (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_hash_join_buff_on_device (int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void init_baseline_hash_join_buff_on_device_32 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
void init_baseline_hash_join_buff_on_device_64 (int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
 
ColumnType get_join_column_type_kind (const SQLTypeInfo &ti)
 
int fill_hash_join_buff_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
 
int fill_hash_join_buff (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void fill_hash_join_buff_on_device (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
 
void fill_hash_join_buff_on_device_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
 
void fill_hash_join_buff_on_device_sharded (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
 
void fill_hash_join_buff_on_device_sharded_bucketized (int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
 
void fill_one_to_many_hash_table (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_sharded_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count)
 
void fill_one_to_many_hash_table_on_device (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_bucketized (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
 
void fill_one_to_many_hash_table_on_device_sharded (int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
 
int fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_32 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int overlaps_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const OverlapsKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
int range_fill_baseline_hash_join_buff_64 (int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
 
void fill_baseline_hash_join_buff_on_device_32 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_baseline_hash_join_buff_on_device_64 (int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void fill_one_to_many_baseline_hash_table_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false)
 
void fill_one_to_many_baseline_hash_table_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join=false, const bool is_geo_compressed=false)
 
void fill_one_to_many_baseline_hash_table_on_device_32 (int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void range_fill_one_to_many_baseline_hash_table_on_device_64 (int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
 
void approximate_distinct_tuples (uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
 
void approximate_distinct_tuples_overlaps (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
 
void approximate_distinct_tuples_range (uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
 
void approximate_distinct_tuples_on_device (uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
 
void approximate_distinct_tuples_on_device_overlaps (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
 
void compute_bucket_sizes_on_cpu (std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
 
void approximate_distinct_tuples_on_device_range (uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
 
void compute_bucket_sizes_on_device (double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
 

Variables

const size_t g_maximum_conditions_to_coalesce {8}
 

Enumeration Type Documentation

enum ColumnType
Enumerator
SmallDate 
Signed 
Unsigned 
Double 

Definition at line 129 of file HashJoinRuntime.h.

Function Documentation

void approximate_distinct_tuples ( uint8_t *  hll_buffer_all_cpus,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const int  thread_count 
)

Definition at line 2149 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, and CHECK_EQ.

Referenced by BaselineJoinHashTable::approximateTupleCount().

2154  {
2155  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2156  CHECK(!join_column_per_key.empty());
2157 
2158  std::vector<std::future<void>> approx_distinct_threads;
2159  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2160  approx_distinct_threads.push_back(std::async(
2161  std::launch::async,
2162  [&join_column_per_key,
2163  &type_info_per_key,
2164  b,
2165  hll_buffer_all_cpus,
2166  padded_size_bytes,
2167  thread_idx,
2168  thread_count] {
2169  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2170 
2171  const auto key_handler = GenericKeyHandler(join_column_per_key.size(),
2172  false,
2173  &join_column_per_key[0],
2174  &type_info_per_key[0],
2175  nullptr,
2176  nullptr);
2177  approximate_distinct_tuples_impl(hll_buffer,
2178  nullptr,
2179  b,
2180  join_column_per_key[0].num_elems,
2181  &key_handler,
2182  thread_idx,
2183  thread_count);
2184  }));
2185  }
2186  for (auto& child : approx_distinct_threads) {
2187  child.get();
2188  }
2189 }
#define CHECK_EQ(x, y)
Definition: Logger.h:231
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:223

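Usage sketch (added for illustration, not generated from the source): each worker thread writes into its own block of padded_size_bytes HLL registers inside hll_buffer_all_cpus, so the caller allocates thread_count contiguous blocks up front. The register count (1 << b), the thread count, and the join_columns / type_infos vectors below are assumptions.

// requires <thread>, <vector>, <cstdint> and "HashJoinRuntime.h"
const uint32_t b = 11;                                // 2^11 HLL registers (assumed)
const size_t padded_size_bytes = size_t(1) << b;      // one byte per register (assumed)
const int thread_count = static_cast<int>(std::thread::hardware_concurrency());
std::vector<uint8_t> hll_buffer_all_cpus(thread_count * padded_size_bytes, 0);

approximate_distinct_tuples(hll_buffer_all_cpus.data(),
                            b,
                            padded_size_bytes,
                            join_columns,   // std::vector<JoinColumn>, fetched elsewhere
                            type_infos,     // std::vector<JoinColumnTypeInfo>
                            thread_count);
// The caller combines the per-thread register blocks before reading off the
// cardinality estimate (see BaselineJoinHashTable::approximateTupleCount).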

void approximate_distinct_tuples_on_device ( uint8_t *  hll_buffer,
const uint32_t  b,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 606 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTable::approximateTupleCount().

609  {
610  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
611  hll_buffer,
612  nullptr,
613  b,
614  num_elems,
615  key_handler);
616 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void approximate_distinct_tuples_on_device_overlaps ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 572 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

576  {
577  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
578  hll_buffer,
579  row_counts_buffer,
580  b,
581  num_elems,
582  key_handler);
583 
584  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
585  inclusive_scan(
586  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
587 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void approximate_distinct_tuples_on_device_range ( uint8_t *  hll_buffer,
const uint32_t  b,
int32_t *  row_counts_buffer,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const size_t  block_size_x,
const size_t  grid_size_x 
)

Definition at line 589 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, getQueryEngineCudaStream(), and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

595  {
596  auto qe_cuda_stream = getQueryEngineCudaStream();
597  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
598  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
599  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
600 
601  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
602  inclusive_scan(
603  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
604 }
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9


void approximate_distinct_tuples_overlaps ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const int  thread_count 
)

Definition at line 2191 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by OverlapsJoinHashTable::approximateTupleCount().

2199  {
2200  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2201  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2202  CHECK(!join_column_per_key.empty());
2203 
2204  std::vector<std::future<void>> approx_distinct_threads;
2205  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2206  approx_distinct_threads.push_back(std::async(
2207  std::launch::async,
2208  [&join_column_per_key,
2209  &join_buckets_per_key,
2210  &row_counts,
2211  b,
2212  hll_buffer_all_cpus,
2213  padded_size_bytes,
2214  thread_idx,
2215  thread_count] {
2216  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2217 
2218  const auto key_handler = OverlapsKeyHandler(
2219  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2220  &join_column_per_key[0],
2221  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2222  approximate_distinct_tuples_impl(hll_buffer,
2223  row_counts.data(),
2224  b,
2225  join_column_per_key[0].num_elems,
2226  &key_handler,
2227  thread_idx,
2228  thread_count);
2229  }));
2230  }
2231  for (auto& child : approx_distinct_threads) {
2232  child.get();
2233  }
2234 
2235  inclusive_scan(
2236  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2237 }
#define CHECK_EQ(x, y)
Definition: Logger.h:231
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:223


void approximate_distinct_tuples_range ( uint8_t *  hll_buffer_all_cpus,
std::vector< int32_t > &  row_counts,
const uint32_t  b,
const size_t  padded_size_bytes,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_buckets_per_key,
const bool  is_compressed,
const int  thread_count 
)

Definition at line 2239 of file HashJoinRuntime.cpp.

References approximate_distinct_tuples_impl(), threading_serial::async(), CHECK, CHECK_EQ, and inclusive_scan().

Referenced by RangeJoinHashTable::approximateTupleCount().

2248  {
2249  CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());
2250  CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());
2251  CHECK(!join_column_per_key.empty());
2252 
2253  std::vector<std::future<void>> approx_distinct_threads;
2254  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2255  approx_distinct_threads.push_back(std::async(
2256  std::launch::async,
2257  [&join_column_per_key,
2258  &join_buckets_per_key,
2259  &row_counts,
2260  b,
2261  hll_buffer_all_cpus,
2262  padded_size_bytes,
2263  thread_idx,
2264  is_compressed,
2265  thread_count] {
2266  auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;
2267 
2268  const auto key_handler = RangeKeyHandler(
2269  is_compressed,
2270  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),
2271  &join_column_per_key[0],
2272  join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());
2273  approximate_distinct_tuples_impl(hll_buffer,
2274  row_counts.data(),
2275  b,
2276  join_column_per_key[0].num_elems,
2277  &key_handler,
2278  thread_idx,
2279  thread_count);
2280  }));
2281  }
2282  for (auto& child : approx_distinct_threads) {
2283  child.get();
2284  }
2285 
2286  inclusive_scan(
2287  row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);
2288 }
#define CHECK_EQ(x, y)
Definition: Logger.h:231
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
future< Result > async(Fn &&fn, Args &&...args)
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define CHECK(condition)
Definition: Logger.h:223


void compute_bucket_sizes_on_cpu ( std::vector< double > &  bucket_sizes_for_dimension,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const std::vector< double > &  bucket_size_thresholds,
const int  thread_count 
)

Definition at line 2290 of file HashJoinRuntime.cpp.

References threading_serial::async().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

2294  {
2295  std::vector<std::vector<double>> bucket_sizes_for_threads;
2296  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2297  bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);
2298  }
2299  std::vector<std::future<void>> threads;
2300  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2301  threads.push_back(std::async(std::launch::async,
2302  compute_bucket_sizes_impl<2>,
2303  bucket_sizes_for_threads[thread_idx].data(),
2304  &join_column,
2305  &type_info,
2306  bucket_size_thresholds.data(),
2307  thread_idx,
2308  thread_count));
2309  }
2310  for (auto& child : threads) {
2311  child.get();
2312  }
2313 
2314  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
2315  for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {
2316  if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {
2317  bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];
2318  }
2319  }
2320  }
2321 }
future< Result > async(Fn &&fn, Args &&...args)

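Usage sketch (illustrative only): the function spawns thread_count workers over compute_bucket_sizes_impl<2> and keeps the per-dimension maximum, so the length of the output vector fixes the number of dimensions. The 2-D setup and threshold values are assumptions; join_column and type_info are presumed to describe an already-fetched inner column.

std::vector<double> bucket_sizes_for_dimension(2, 0.0);    // 2-D overlaps join (assumed)
const std::vector<double> bucket_size_thresholds(2, 0.1);  // illustrative thresholds
const int thread_count = 8;

compute_bucket_sizes_on_cpu(bucket_sizes_for_dimension,
                            join_column,
                            type_info,
                            bucket_size_thresholds,
                            thread_count);
// bucket_sizes_for_dimension now holds, per dimension, the largest candidate
// produced by any worker thread.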

void compute_bucket_sizes_on_device ( double *  bucket_sizes_buffer,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const double *  bucket_size_thresholds 
)

Definition at line 618 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes().

621  {
622  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
623  bucket_sizes_buffer,
624  join_column,
625  type_info,
626  bucket_sz_threshold);
627 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


int fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1758 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1767  {
1768  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1769  entry_count,
1770  invalid_slot_val,
1771  for_semi_join,
1772  key_component_count,
1773  with_val_slot,
1774  key_handler,
1775  num_elems,
1776  cpu_thread_idx,
1777  cpu_thread_count);
1778 }

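Single-threaded CPU sketch of the baseline (composite-key) build path (illustrative, not from the source): seed the buffer with init_baseline_hash_join_buff_32, then fill it through a GenericKeyHandler. Buffer sizing, entry count, and handler arguments follow the patterns visible elsewhere on this page but are assumptions here; constructing a GenericKeyHandler requires its own header.

const int64_t entry_count = 1 << 16;                       // illustrative
const size_t key_component_count = join_columns.size();    // one JoinColumn per key part
const bool with_val_slot = true;
const int32_t invalid_slot_val = -1;
// One int32 slot per key component plus an optional value slot per entry (assumed layout).
std::vector<int8_t> hash_buff(entry_count * (key_component_count + 1) * sizeof(int32_t));

init_baseline_hash_join_buff_32(hash_buff.data(), entry_count, key_component_count,
                                with_val_slot, invalid_slot_val,
                                /*cpu_thread_idx=*/0, /*cpu_thread_count=*/1);

GenericKeyHandler key_handler(key_component_count, false,
                              join_columns.data(), type_infos.data(),
                              nullptr, nullptr);  // argument order as in the CPU code above
const int err = fill_baseline_hash_join_buff_32(
    hash_buff.data(), entry_count, invalid_slot_val, /*for_semi_join=*/false,
    key_component_count, with_val_slot, &key_handler,
    join_columns[0].num_elems, /*cpu_thread_idx=*/0, /*cpu_thread_count=*/1);
// Callers treat a non-zero return as a failed build (assumption).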

int fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
const GenericKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1822 of file HashJoinRuntime.cpp.

1831  {
1832  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1833  entry_count,
1834  invalid_slot_val,
1835  for_semi_join,
1836  key_component_count,
1837  with_val_slot,
1838  key_handler,
1839  num_elems,
1840  cpu_thread_idx,
1841  cpu_thread_count);
1842 }
void fill_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 444 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by fill_baseline_hash_join_buff_on_device().

452  {
453  cuda_kernel_launch_wrapper(
454  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
455  hash_buff,
456  entry_count,
457  invalid_slot_val,
458  for_semi_join,
459  key_component_count,
460  with_val_slot,
461  dev_err_buff,
462  key_handler,
463  num_elems);
464 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 466 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

474  {
475  cuda_kernel_launch_wrapper(
476  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
477  hash_buff,
478  entry_count,
479  invalid_slot_val,
480  for_semi_join,
481  key_component_count,
482  with_val_slot,
483  dev_err_buff,
484  key_handler,
485  num_elems);
486 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


int fill_hash_join_buff ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 195 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_wrapper().

203  {
204  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
205  : SUFFIX(fill_one_to_one_hashtable);
206  auto hashtable_filling_func = [&](auto elem, size_t index) {
207  auto entry_ptr = SUFFIX(get_hash_slot)(buff, elem, type_info.min_val);
208  return filling_func(index, entry_ptr, invalid_slot_val);
209  };
210 
211  return fill_hash_join_buff_impl(buff,
212  join_column,
213  type_info,
214  sd_inner_to_outer_translation_map,
215  min_inner_elem,
216  cpu_thread_idx,
217  cpu_thread_count,
218  hashtable_filling_func);
219 }
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:75
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45

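Single-threaded sketch of the perfect one-to-one CPU build (illustrative, not from the source): seed the buffer with the invalid sentinel via init_hash_join_buff, then fill it. Deriving the entry count from the column's value range assumes JoinColumnTypeInfo carries a max_val alongside the min_val shown above.

const int64_t entry_count = type_info.max_val - type_info.min_val + 1;  // assumed sizing
const int32_t invalid_slot_val = -1;
std::vector<int32_t> buff(entry_count, 0);

init_hash_join_buff(buff.data(), entry_count, invalid_slot_val,
                    /*cpu_thread_idx=*/0, /*cpu_thread_count=*/1);

const int err = fill_hash_join_buff(buff.data(), invalid_slot_val,
                                    /*for_semi_join=*/false,
                                    join_column, type_info,
                                    /*sd_inner_to_outer_translation_map=*/nullptr,
                                    /*min_inner_elem=*/0,
                                    /*cpu_thread_idx=*/0, /*cpu_thread_count=*/1);
// A non-zero return means a slot was already occupied, i.e. the key column is not
// one-to-one and the caller should fall back to a one-to-many layout (assumption).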

int fill_hash_join_buff_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count,
const int64_t  bucket_normalization 
)

Definition at line 166 of file HashJoinRuntime.cpp.

References fill_hash_join_buff_impl(), fill_hashtable_for_semi_join(), fill_one_to_one_hashtable(), get_bucketized_hash_slot(), and SUFFIX.

Referenced by fill_hash_join_buff_bucketized_wrapper(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

176  {
177  auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)
178  : SUFFIX(fill_one_to_one_hashtable);
179  auto hashtable_filling_func = [&](auto elem, size_t index) {
180  auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(
181  buff, elem, type_info.min_val, bucket_normalization);
182  return filling_func(index, entry_ptr, invalid_slot_val);
183  };
184 
185  return fill_hash_join_buff_impl(buff,
186  join_column,
187  type_info,
188  sd_inner_to_outer_translation_map,
189  min_inner_elem,
190  cpu_thread_idx,
191  cpu_thread_count,
192  hashtable_filling_func);
193 }
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:67
#define SUFFIX(name)
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:55
DEVICE auto fill_hash_join_buff_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
const int64_t min_val
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:45


void fill_hash_join_buff_on_device ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info 
)

Definition at line 85 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper().

90  {
91  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
92  buff,
93  invalid_slot_val,
94  for_semi_join,
95  join_column,
96  type_info,
97  dev_err_buff);
98 }
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void fill_hash_join_buff_on_device_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const int64_t  bucket_normalization 
)

Definition at line 68 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_bucketized_wrapper().

74  {
75  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
76  buff,
77  invalid_slot_val,
78  for_semi_join,
79  join_column,
80  type_info,
81  dev_err_buff,
82  bucket_normalization);
83 }
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void fill_hash_join_buff_on_device_sharded ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info 
)

Definition at line 163 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded().

169  {
170  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
171  buff,
172  invalid_slot_val,
173  for_semi_join,
174  join_column,
175  type_info,
176  shard_info,
177  dev_err_buff);
178 }
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void fill_hash_join_buff_on_device_sharded_bucketized ( int32_t *  buff,
const int32_t  invalid_slot_val,
const bool  for_semi_join,
int *  dev_err_buff,
const JoinColumn  join_column,
const JoinColumnTypeInfo  type_info,
const ShardInfo  shard_info,
const int64_t  bucket_normalization 
)

Definition at line 143 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and fill_hash_join_buff_wrapper_sharded_bucketized().

151  {
152  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
153  buff,
154  invalid_slot_val,
155  for_semi_join,
156  join_column,
157  type_info,
158  shard_info,
159  dev_err_buff,
160  bucket_normalization);
161 }
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void fill_one_to_many_baseline_hash_table_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false 
)

Definition at line 2095 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2107  {
2108  fill_one_to_many_baseline_hash_table<int32_t>(buff,
2109  composite_key_dict,
2110  hash_entry_count,
2111  key_component_count,
2112  join_column_per_key,
2113  type_info_per_key,
2114  join_bucket_info,
2115  sd_inner_to_outer_translation_maps,
2116  sd_min_inner_elems,
2117  cpu_thread_count,
2118  is_range_join,
2119  is_geo_compressed);
2120 }


void fill_one_to_many_baseline_hash_table_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const std::vector< JoinColumn > &  join_column_per_key,
const std::vector< JoinColumnTypeInfo > &  type_info_per_key,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const std::vector< const int32_t * > &  sd_inner_to_outer_translation_maps,
const std::vector< int32_t > &  sd_min_inner_elems,
const int32_t  cpu_thread_count,
const bool  is_range_join = false,
const bool  is_geo_compressed = false 
)

Definition at line 2122 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

2134  {
2135  fill_one_to_many_baseline_hash_table<int64_t>(buff,
2136  composite_key_dict,
2137  hash_entry_count,
2138  key_component_count,
2139  join_column_per_key,
2140  type_info_per_key,
2141  join_bucket_info,
2142  sd_inner_to_outer_translation_maps,
2143  sd_min_inner_elems,
2144  cpu_thread_count,
2145  is_range_join,
2146  is_geo_compressed);
2147 }


void fill_one_to_many_baseline_hash_table_on_device_32 ( int32_t *  buff,
const int32_t *  composite_key_dict,
const int64_t  hash_entry_count,
const size_t  key_component_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 531 of file HashJoinRuntimeGpu.cu.

Referenced by fill_one_to_many_baseline_hash_table_on_device().

537  {
538  fill_one_to_many_baseline_hash_table_on_device<int32_t>(
539  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
540 }


void fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const GenericKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 542 of file HashJoinRuntimeGpu.cu.

547  {
548  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
549  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
550 }
void fill_one_to_many_hash_table ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1458 of file HashJoinRuntime.cpp.

References count_matches(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1464  {
1465  auto timer = DEBUG_TIMER(__func__);
1466  auto launch_count_matches = [count_buff = buff + hash_entry_info.hash_entry_count,
1467  &join_column,
1468  &type_info,
1469  sd_inner_to_outer_translation_map,
1470  min_inner_elem](auto cpu_thread_idx,
1471  auto cpu_thread_count) {
1472  SUFFIX(count_matches)
1473  (count_buff,
1474  join_column,
1475  type_info,
1476  sd_inner_to_outer_translation_map,
1477  min_inner_elem,
1478  cpu_thread_idx,
1479  cpu_thread_count);
1480  };
1481  auto launch_fill_row_ids = [hash_entry_count = hash_entry_info.hash_entry_count,
1482  buff,
1483  &join_column,
1484  &type_info,
1485  sd_inner_to_outer_translation_map,
1486  min_inner_elem](auto cpu_thread_idx,
1487  auto cpu_thread_count) {
1488  SUFFIX(fill_row_ids)
1489  (buff,
1490  hash_entry_count,
1491  join_column,
1492  type_info,
1493  sd_inner_to_outer_translation_map,
1494  min_inner_elem,
1495  cpu_thread_idx,
1496  cpu_thread_count);
1497  };
1498 
1499  fill_one_to_many_hash_table_impl(buff,
1500  hash_entry_info.hash_entry_count,
1501  join_column,
1502  type_info,
1503  sd_inner_to_outer_translation_map,
1504  min_inner_elem,
1505  cpu_thread_count,
1506  launch_count_matches,
1507  launch_fill_row_ids);
1508 }
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
size_t hash_entry_count
#define DEBUG_TIMER(name)
Definition: Logger.h:370

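Sketch of the one-to-many CPU build (illustrative): the body above addresses its count buffer at buff + hash_entry_count, which implies a layout of per-entry offsets, then per-entry counts, then a row-id payload; the total size used below is an assumption.

const size_t entry_count = 1 << 16;                        // illustrative
HashEntryInfo hash_entry_info;
hash_entry_info.hash_entry_count = entry_count;
hash_entry_info.bucket_normalization = 1;                  // unused by this path (assumption)
const size_t num_rows = join_column.num_elems;
std::vector<int32_t> buff(2 * entry_count + num_rows, 0);  // offsets + counts + row ids (assumed)

fill_one_to_many_hash_table(buff.data(),
                            hash_entry_info,
                            join_column,
                            type_info,
                            /*sd_inner_to_outer_translation_map=*/nullptr,
                            /*min_inner_elem=*/0,
                            std::thread::hardware_concurrency());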

void fill_one_to_many_hash_table_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)

Definition at line 1510 of file HashJoinRuntime.cpp.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), DEBUG_TIMER, fill_one_to_many_hash_table_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

Referenced by PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu().

1517  {
1518  auto timer = DEBUG_TIMER(__func__);
1519  auto bucket_normalization = hash_entry_info.bucket_normalization;
1520  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
1521  auto launch_count_matches = [bucket_normalization,
1522  count_buff = buff + hash_entry_count,
1523  &join_column,
1524  &type_info,
1525  sd_inner_to_outer_translation_map,
1526  min_inner_elem](auto cpu_thread_idx,
1527  auto cpu_thread_count) {
1528  SUFFIX(count_matches_bucketized)
1529  (count_buff,
1530  join_column,
1531  type_info,
1532  sd_inner_to_outer_translation_map,
1533  min_inner_elem,
1534  cpu_thread_idx,
1535  cpu_thread_count,
1536  bucket_normalization);
1537  };
1538  auto launch_fill_row_ids = [bucket_normalization,
1539  hash_entry_count,
1540  buff,
1541  &join_column,
1542  &type_info,
1543  sd_inner_to_outer_translation_map,
1544  min_inner_elem](auto cpu_thread_idx,
1545  auto cpu_thread_count) {
1546  SUFFIX(fill_row_ids_bucketized)
1547  (buff,
1548  hash_entry_count,
1549  join_column,
1550  type_info,
1551  sd_inner_to_outer_translation_map,
1552  min_inner_elem,
1553  cpu_thread_idx,
1554  cpu_thread_count,
1555  bucket_normalization);
1556  };
1557 
1558  fill_one_to_many_hash_table_impl(buff,
1559  hash_entry_count,
1560  join_column,
1561  type_info,
1562  sd_inner_to_outer_translation_map,
1563  min_inner_elem,
1564  cpu_thread_count,
1565  launch_count_matches,
1566  launch_fill_row_ids);
1567 }
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const unsigned cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
#define SUFFIX(name)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
size_t getNormalizedHashEntryCount() const
#define DEBUG_TIMER(name)
Definition: Logger.h:370
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)


void fill_one_to_many_hash_table_on_device ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info 
)

Definition at line 247 of file HashJoinRuntimeGpu.cu.

References count_matches(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids(), HashEntryInfo::hash_entry_count, and SUFFIX.

250  {
251  auto hash_entry_count = hash_entry_info.hash_entry_count;
252  auto count_matches_func = [count_buff = buff + hash_entry_count,
253  join_column,
254  type_info] {
255  cuda_kernel_launch_wrapper(SUFFIX(count_matches), count_buff, join_column, type_info);
256  };
257 
258  auto fill_row_ids_func = [buff, hash_entry_count, join_column, type_info] {
259  cuda_kernel_launch_wrapper(
260  SUFFIX(fill_row_ids), buff, hash_entry_count, join_column, type_info);
261  };
262 
263  fill_one_to_many_hash_table_on_device_impl(buff,
264  hash_entry_count,
265  join_column,
266  type_info,
267  count_matches_func,
268  fill_row_ids_func);
269 }
#define SUFFIX(name)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
size_t hash_entry_count
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void fill_one_to_many_hash_table_on_device_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info 
)

Definition at line 271 of file HashJoinRuntimeGpu.cu.

References HashEntryInfo::bucket_normalization, count_matches_bucketized(), cuda_kernel_launch_wrapper(), fill_one_to_many_hash_table_on_device_impl(), fill_row_ids_bucketized(), HashEntryInfo::getNormalizedHashEntryCount(), and SUFFIX.

275  {
276  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
277  auto count_matches_func = [count_buff = buff + hash_entry_count,
278  join_column,
279  type_info,
280  bucket_normalization =
281  hash_entry_info.bucket_normalization] {
282  cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
283  count_buff,
284  join_column,
285  type_info,
286  bucket_normalization);
287  };
288 
289  auto fill_row_ids_func = [buff,
290  hash_entry_count =
291  hash_entry_info.getNormalizedHashEntryCount(),
292  join_column,
293  type_info,
294  bucket_normalization = hash_entry_info.bucket_normalization] {
295  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
296  buff,
297  hash_entry_count,
298  join_column,
299  type_info,
300  bucket_normalization);
301  };
302 
303  fill_one_to_many_hash_table_on_device_impl(buff,
304  hash_entry_count,
305  join_column,
306  type_info,
307  count_matches_func,
308  fill_row_ids_func);
309 }
#define SUFFIX(name)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
int64_t bucket_normalization
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
size_t getNormalizedHashEntryCount() const
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)


void fill_one_to_many_hash_table_on_device_sharded ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info 
)

Definition at line 311 of file HashJoinRuntimeGpu.cu.

References checkCudaErrors, count_matches_sharded(), cuda_kernel_launch_wrapper(), fill_row_ids_sharded(), getQueryEngineCudaStream(), HashEntryInfo::hash_entry_count, inclusive_scan(), set_valid_pos(), set_valid_pos_flag(), and SUFFIX.

315  {
316  auto hash_entry_count = hash_entry_info.hash_entry_count;
317  int32_t* pos_buff = buff;
318  int32_t* count_buff = buff + hash_entry_count;
319  auto qe_cuda_stream = getQueryEngineCudaStream();
320  checkCudaErrors(
321  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
322  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
323  cuda_kernel_launch_wrapper(
324  SUFFIX(count_matches_sharded), count_buff, join_column, type_info, shard_info);
325 
326  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
327 
328  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
329  inclusive_scan(
330  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
331  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
332  checkCudaErrors(
333  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
334  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
335  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
336  buff,
337  hash_entry_count,
338  join_column,
339  type_info,
340  shard_info);
341 }
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define SUFFIX(name)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
size_t hash_entry_count
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)


void fill_one_to_many_hash_table_sharded_bucketized ( int32_t *  buff,
const HashEntryInfo  hash_entry_info,
const int32_t  invalid_slot_val,
const JoinColumn join_column,
const JoinColumnTypeInfo type_info,
const ShardInfo shard_info,
const int32_t *  sd_inner_to_outer_translation_map,
const int32_t  min_inner_elem,
const unsigned  cpu_thread_count 
)
ColumnType get_join_column_type_kind ( const SQLTypeInfo ti)
inline

Definition at line 156 of file HashJoinRuntime.h.

References SQLTypeInfo::is_date_in_days(), is_unsigned_type(), Signed, SmallDate, and Unsigned.

Referenced by OverlapsJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTable::fetchColumnsForDevice(), BaselineJoinHashTable::fetchColumnsForDevice(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

156  {
157  if (ti.is_date_in_days()) {
158  return SmallDate;
159  } else {
160  return is_unsigned_type(ti) ? Unsigned : Signed;
161  }
162 }
bool is_date_in_days() const
Definition: sqltypes.h:873
bool is_unsigned_type(const SQLTypeInfo &ti)

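Dispatch sketch (illustrative): ColumnType is an unscoped enum, so callers typically switch directly on the result to pick the matching build/probe path for the inner column; inner_col_ti stands in for a SQLTypeInfo obtained elsewhere.

switch (get_join_column_type_kind(inner_col_ti)) {
  case SmallDate:
    // DATE encoded in days; the *_bucketized build paths apply
    break;
  case Unsigned:
    // values are decoded as unsigned integers
    break;
  case Double:
    // not produced by this helper; listed only for completeness
    break;
  case Signed:
  default:
    // default signed-integer treatment
    break;
}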

void init_baseline_hash_join_buff_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1702 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1708  {
1709  init_baseline_hash_join_buff<int32_t>(hash_join_buff,
1710  entry_count,
1711  key_component_count,
1712  with_val_slot,
1713  invalid_slot_val,
1714  cpu_thread_idx,
1715  cpu_thread_count);
1716 }


void init_baseline_hash_join_buff_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1718 of file HashJoinRuntime.cpp.

Referenced by BaselineJoinHashTableBuilder::initHashTableOnCpu().

1724  {
1725  init_baseline_hash_join_buff<int64_t>(hash_join_buff,
1726  entry_count,
1727  key_component_count,
1728  with_val_slot,
1729  invalid_slot_val,
1730  cpu_thread_idx,
1731  cpu_thread_count);
1732 }


void init_baseline_hash_join_buff_on_device_32 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 395 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

399  {
400  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
401  hash_join_buff,
402  entry_count,
403  key_component_count,
404  with_val_slot,
405  invalid_slot_val);
406 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void init_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_join_buff,
const int64_t  entry_count,
const size_t  key_component_count,
const bool  with_val_slot,
const int32_t  invalid_slot_val 
)

Definition at line 408 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

412  {
413  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
414  hash_join_buff,
415  entry_count,
416  key_component_count,
417  with_val_slot,
418  invalid_slot_val);
419 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void init_hash_join_buff ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 70 of file HashJoinRuntime.cpp.

Referenced by init_hash_join_buff_wrapper(), BaselineJoinHashTableBuilder::initHashTableOnCpu(), PerfectJoinHashTableBuilder::initOneToManyHashTableOnCpu(), and PerfectJoinHashTableBuilder::initOneToOneHashTableOnCpu().

74  {
75 #ifdef __CUDACC__
76  int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
77  int32_t step = blockDim.x * gridDim.x;
78 #else
79  int32_t start = cpu_thread_idx;
80  int32_t step = cpu_thread_count;
81 #endif
82  for (int64_t i = start; i < hash_entry_count; i += step) {
83  groups_buffer[i] = invalid_slot_val;
84  }
85 }

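Sketch of the CPU threading convention (illustrative): the body above strides by cpu_thread_idx / cpu_thread_count (or by the CUDA grid when compiled as device code), so a caller can split initialization across workers simply by handing each one its index. buff and entry_count are assumed to exist.

// requires <thread> and <vector>
const int32_t invalid_slot_val = -1;
const int32_t cpu_thread_count =
    static_cast<int32_t>(std::thread::hardware_concurrency());
std::vector<std::thread> workers;
for (int32_t thread_idx = 0; thread_idx < cpu_thread_count; ++thread_idx) {
  // Each worker initializes slots thread_idx, thread_idx + cpu_thread_count, ...
  workers.emplace_back(init_hash_join_buff, buff.data(), entry_count,
                       invalid_slot_val, thread_idx, cpu_thread_count);
}
for (auto& w : workers) {
  w.join();
}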

void init_hash_join_buff_on_device ( int32_t *  buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val 
)

Definition at line 186 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper(), and init_hash_join_buff_wrapper().

Referenced by BaselineJoinHashTableBuilder::initHashTableOnGpu().

188  {
189  cuda_kernel_launch_wrapper(
190  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
191 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)


int overlaps_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1780 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1788  {
1789  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1790  entry_count,
1791  invalid_slot_val,
1792  false,
1793  key_component_count,
1794  with_val_slot,
1795  key_handler,
1796  num_elems,
1797  cpu_thread_idx,
1798  cpu_thread_count);
1799 }


int overlaps_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1844 of file HashJoinRuntime.cpp.

1852  {
1853  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1854  entry_count,
1855  invalid_slot_val,
1856  false,
1857  key_component_count,
1858  with_val_slot,
1859  key_handler,
1860  num_elems,
1861  cpu_thread_idx,
1862  cpu_thread_count);
1863 }
void overlaps_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 488 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

496  {
497  cuda_kernel_launch_wrapper(
498  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
499  hash_buff,
500  entry_count,
501  invalid_slot_val,
502  false,
503  key_component_count,
504  with_val_slot,
505  dev_err_buff,
506  key_handler,
507  num_elems);
508 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void overlaps_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const int64_t  hash_entry_count,
const OverlapsKeyHandler key_handler,
const int64_t  num_elems 
)

Definition at line 552 of file HashJoinRuntimeGpu.cu.

557  {
558  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
559  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
560 }
int range_fill_baseline_hash_join_buff_32 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1801 of file HashJoinRuntime.cpp.

Referenced by fill_baseline_hash_join_buff().

1809  {
1810  return fill_baseline_hash_join_buff<int32_t>(hash_buff,
1811  entry_count,
1812  invalid_slot_val,
1813  false,
1814  key_component_count,
1815  with_val_slot,
1816  key_handler,
1817  num_elems,
1818  cpu_thread_idx,
1819  cpu_thread_count);
1820 }


int range_fill_baseline_hash_join_buff_64 ( int8_t *  hash_buff,
const size_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
const RangeKeyHandler key_handler,
const size_t  num_elems,
const int32_t  cpu_thread_idx,
const int32_t  cpu_thread_count 
)

Definition at line 1865 of file HashJoinRuntime.cpp.

1873  {
1874  return fill_baseline_hash_join_buff<int64_t>(hash_buff,
1875  entry_count,
1876  invalid_slot_val,
1877  false,
1878  key_component_count,
1879  with_val_slot,
1880  key_handler,
1881  num_elems,
1882  cpu_thread_idx,
1883  cpu_thread_count);
1884 }
void range_fill_baseline_hash_join_buff_on_device_64 ( int8_t *  hash_buff,
const int64_t  entry_count,
const int32_t  invalid_slot_val,
const size_t  key_component_count,
const bool  with_val_slot,
int *  dev_err_buff,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 510 of file HashJoinRuntimeGpu.cu.

References cuda_kernel_launch_wrapper().

517  {
518  cuda_kernel_launch_wrapper(
519  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
520  hash_buff,
521  entry_count,
522  invalid_slot_val,
523  false,
524  key_component_count,
525  with_val_slot,
526  dev_err_buff,
527  key_handler,
528  num_elems);
529 }
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)


void range_fill_one_to_many_baseline_hash_table_on_device_64 ( int32_t *  buff,
const int64_t *  composite_key_dict,
const size_t  hash_entry_count,
const RangeKeyHandler key_handler,
const size_t  num_elems 
)

Definition at line 562 of file HashJoinRuntimeGpu.cu.

567  {
568  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
569  buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
570 }

Variable Documentation