OmniSciDB  fe05a0c208
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HashJoinRuntimeGpu.cu
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "HashJoinRuntime.cpp"
17 
18 #include <thrust/device_ptr.h>
19 #include <thrust/scan.h>
20 
21 #define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
22 
template <typename F, typename... ARGS>
// Launches `func` with an occupancy-derived grid/block configuration.
// The occupancy calculator picks the block size that maximizes occupancy
// for this kernel; launch errors are surfaced via cudaGetLastError()
// because kernel launches themselves return no status.
void cuda_kernel_launch_wrapper(F func, ARGS&&... args) {
  int grid_size = -1;
  int block_size = -1;
  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
  func<<<grid_size, block_size>>>(std::forward<ARGS>(args)...);
  checkCudaErrors(cudaGetLastError());
}
31 
// Kernel wrapper for the one-to-one hash-fill routine. The trailing
// NULL/-1 arguments are the CPU-only string-dictionary proxies and
// thread index/count, which do not apply on the GPU path.
__global__ void fill_hash_join_buff_wrapper(int32_t* buff,
                                            const int32_t invalid_slot_val,
                                            const JoinColumn join_column,
                                            const JoinColumnTypeInfo type_info,
                                            int* err) {
  int partial_err = SUFFIX(fill_hash_join_buff)(
      buff, invalid_slot_val, join_column, type_info, NULL, NULL, -1, -1);
  // Record the first non-zero error code without clobbering one already set.
  atomicCAS(err, 0, partial_err);
}
41 
// Kernel wrapper for the bucketized (range-normalized) one-to-one hash fill.
// NULL/-1 arguments are the CPU-only dictionary proxies and thread info.
__global__ void fill_hash_join_buff_bucketized_wrapper(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
                                                           invalid_slot_val,
                                                           join_column,
                                                           type_info,
                                                           NULL,
                                                           NULL,
                                                           -1,
                                                           -1,
                                                           bucket_normalization);
  // Keep the first non-zero error reported by any thread.
  atomicCAS(err, 0, partial_err);
}
60 
// Host entry point: launch the bucketized one-to-one fill kernel.
// Errors raised by device threads land in dev_err_buff.
void fill_hash_join_buff_on_device_bucketized(int32_t* buff,
                                              const int32_t invalid_slot_val,
                                              int* dev_err_buff,
                                              const JoinColumn join_column,
                                              const JoinColumnTypeInfo type_info,
                                              const int64_t bucket_normalization) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             dev_err_buff,
                             bucket_normalization);
}
75 
// Host entry point: launch the one-to-one fill kernel.
// Errors raised by device threads land in dev_err_buff.
void fill_hash_join_buff_on_device(int32_t* buff,
                                   const int32_t invalid_slot_val,
                                   int* dev_err_buff,
                                   const JoinColumn join_column,
                                   const JoinColumnTypeInfo type_info) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             dev_err_buff);
}
88 
// Kernel wrapper for the sharded + bucketized one-to-one hash fill.
// NULL/-1 arguments are the CPU-only dictionary proxies and thread info.
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
                                                                   invalid_slot_val,
                                                                   join_column,
                                                                   type_info,
                                                                   shard_info,
                                                                   NULL,
                                                                   NULL,
                                                                   -1,
                                                                   -1,
                                                                   bucket_normalization);
  // Keep the first non-zero error reported by any thread.
  atomicCAS(err, 0, partial_err);
}
109 
// Kernel wrapper for the sharded one-to-one hash fill. NULL/-1 arguments
// are the CPU-only dictionary proxies and thread info.
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t* buff,
                                                    const int32_t invalid_slot_val,
                                                    const JoinColumn join_column,
                                                    const JoinColumnTypeInfo type_info,
                                                    const ShardInfo shard_info,
                                                    int* err) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(
      buff, invalid_slot_val, join_column, type_info, shard_info, NULL, NULL, -1, -1);
  // Keep the first non-zero error reported by any thread.
  atomicCAS(err, 0, partial_err);
}
120 
// Host entry point: launch the sharded + bucketized one-to-one fill kernel.
void fill_hash_join_buff_on_device_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    int* dev_err_buff,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    const int64_t bucket_normalization) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff,
                             bucket_normalization);
}
138 
// Host entry point: launch the sharded one-to-one fill kernel.
void fill_hash_join_buff_on_device_sharded(int32_t* buff,
                                           const int32_t invalid_slot_val,
                                           int* dev_err_buff,
                                           const JoinColumn join_column,
                                           const JoinColumnTypeInfo type_info,
                                           const ShardInfo shard_info) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff);
}
153 
// Kernel wrapper: initialize every slot of the hash buffer to
// invalid_slot_val. The -1/-1 arguments are CPU-only thread info.
__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int64_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}
159 
// Host entry point: launch the hash-buffer initialization kernel.
void init_hash_join_buff_on_device(int32_t* buff,
                                   const int64_t hash_entry_count,
                                   const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(
      init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
}
166 
167 #define VALID_POS_FLAG 0
168 
// Grid-stride kernel: for every hash entry that has at least one match
// (non-zero count), mark its position slot with VALID_POS_FLAG so that
// set_valid_pos can later patch in the scanned offsets.
__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (count_buff[i]) {
      pos_buff[i] = VALID_POS_FLAG;
    }
  }
}
180 
// Grid-stride kernel: replace each VALID_POS_FLAG marker with the
// exclusive-prefix-sum offset of its entry. count_buff is expected to
// hold an inclusive scan, so entry i's start offset is count_buff[i-1]
// (0 for the first entry).
__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (VALID_POS_FLAG == pos_buff[i]) {
      pos_buff[i] = !i ? 0 : count_buff[i - 1];
    }
  }
}
192 
template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
// Shared driver for building a one-to-many hash table layout:
//   [pos_buff (hash_entry_count) | count_buff (hash_entry_count) | row ids]
// 1. count matches per entry, 2. flag occupied entries, 3. inclusive-scan
// counts into offsets, 4. convert flags into offsets, 5. scatter row ids.
// count_buff is zeroed again before the fill pass because the fill kernels
// reuse it as a per-entry write cursor.
void fill_one_to_many_hash_table_on_device_impl(int32_t* buff,
                                                const int64_t hash_entry_count,
                                                const int32_t invalid_slot_val,
                                                const JoinColumn& join_column,
                                                const JoinColumnTypeInfo& type_info,
                                                COUNT_MATCHES_FUNCTOR count_matches_func,
                                                FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  checkCudaErrors(cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t)));
  count_matches_func();

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);

  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  checkCudaErrors(cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t)));
  fill_row_ids_func();
}
216 
// Build a one-to-many hash table for the plain (non-bucketized,
// non-sharded) case by plugging count/fill kernels into the shared driver.
void fill_one_to_many_hash_table_on_device(int32_t* buff,
                                           const HashEntryInfo hash_entry_info,
                                           const int32_t invalid_slot_val,
                                           const JoinColumn& join_column,
                                           const JoinColumnTypeInfo& type_info) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  auto count_matches_func = [hash_entry_count,
                             count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info] {
    cuda_kernel_launch_wrapper(
        SUFFIX(count_matches), count_buff, invalid_slot_val, join_column, type_info);
  };

  auto fill_row_ids_func =
      [buff, hash_entry_count, invalid_slot_val, join_column, type_info] {
        cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
                                   buff,
                                   hash_entry_count,
                                   invalid_slot_val,
                                   join_column,
                                   type_info);
      };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             count_matches_func,
                                             fill_row_ids_func);
}
250 
// Build a one-to-many hash table for the bucketized case: entry count is
// the normalized count and the kernels receive bucket_normalization.
void fill_one_to_many_hash_table_on_device_bucketized(
    int32_t* buff,
    const HashEntryInfo hash_entry_info,
    const int32_t invalid_slot_val,
    const JoinColumn& join_column,
    const JoinColumnTypeInfo& type_info) {
  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
  auto count_matches_func = [count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             bucket_normalization =
                                 hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
                               count_buff,
                               invalid_slot_val,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  auto fill_row_ids_func = [buff,
                            hash_entry_count =
                                hash_entry_info.getNormalizedHashEntryCount(),
                            invalid_slot_val,
                            join_column,
                            type_info,
                            bucket_normalization = hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
                               buff,
                               hash_entry_count,
                               invalid_slot_val,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             count_matches_func,
                                             fill_row_ids_func);
}
296 
// Build a one-to-many hash table for the sharded case. Inlines the same
// count / flag / scan / offset / fill sequence as
// fill_one_to_many_hash_table_on_device_impl, using the sharded kernels.
void fill_one_to_many_hash_table_on_device_sharded(int32_t* buff,
                                                   const HashEntryInfo hash_entry_info,
                                                   const int32_t invalid_slot_val,
                                                   const JoinColumn& join_column,
                                                   const JoinColumnTypeInfo& type_info,
                                                   const ShardInfo& shard_info) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  checkCudaErrors(cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t)));
  cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
                             count_buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  checkCudaErrors(cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t)));
  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
                             buff,
                             hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info);
}
329 
template <typename T, typename KEY_HANDLER>
// Build a one-to-many baseline (composite-key) hash table: count matches
// per composite key, flag occupied entries, inclusive-scan the counts into
// offsets, then scatter row ids. T is the composite-key dictionary word
// width (int32_t or int64_t).
void fill_one_to_many_baseline_hash_table_on_device(int32_t* buff,
                                                    const T* composite_key_dict,
                                                    const int64_t hash_entry_count,
                                                    const int32_t invalid_slot_val,
                                                    const KEY_HANDLER* key_handler,
                                                    const size_t num_elems) {
  auto pos_buff = buff;
  auto count_buff = buff + hash_entry_count;
  checkCudaErrors(cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t)));
  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
                             count_buff,
                             composite_key_dict,
                             hash_entry_count,
                             key_handler,
                             num_elems);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  checkCudaErrors(cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t)));

  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
                             buff,
                             composite_key_dict,
                             hash_entry_count,
                             invalid_slot_val,
                             key_handler,
                             num_elems);
}
363 
template <typename T>
// Kernel wrapper: initialize a baseline (composite-key) hash buffer with
// T-wide key slots. The -1/-1 arguments are CPU-only thread info.
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                     const int64_t entry_count,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     const int32_t invalid_slot_val) {
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}
378 
// Host entry point: initialize a baseline hash buffer with 32-bit key slots.
void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
391 
// Host entry point: initialize a baseline hash buffer with 64-bit key slots.
void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
404 
template <typename T, typename KEY_HANDLER>
// Kernel wrapper for the baseline (composite-key) hash fill. The -1/-1
// arguments are CPU-only thread info; per-thread error codes are folded
// into *err, keeping the first non-zero one.
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
                                                     const int64_t entry_count,
                                                     const int32_t invalid_slot_val,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     int* err,
                                                     const KEY_HANDLER* key_handler,
                                                     const int64_t num_elems) {
  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                            entry_count,
                                                            invalid_slot_val,
                                                            key_component_count,
                                                            with_val_slot,
                                                            key_handler,
                                                            num_elems,
                                                            -1,
                                                            -1);
  atomicCAS(err, 0, partial_err);
}
425 
// Host entry point: fill a baseline hash table with 32-bit key slots using
// the generic key handler.
void fill_baseline_hash_join_buff_on_device_32(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
445 
// Host entry point: fill a baseline hash table with 64-bit key slots using
// the generic key handler.
void fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
465 
// Host entry point: fill a baseline hash table with 64-bit key slots using
// the overlaps (geospatial bucket) key handler.
void overlaps_fill_baseline_hash_join_buff_on_device_64(
    int8_t* hash_buff,
    const int64_t entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const bool with_val_slot,
    int* dev_err_buff,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
486 
// Host entry point: one-to-many baseline table over a 32-bit composite-key
// dictionary. Note: key_component_count is part of the public signature but
// is not needed by the templated implementation.
void fill_one_to_many_baseline_hash_table_on_device_32(
    int32_t* buff,
    const int32_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems);
}
502 
// Host entry point: one-to-many baseline table over a 64-bit composite-key
// dictionary with the generic key handler.
void fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems);
}
517 
// Host entry point: one-to-many baseline table over a 64-bit composite-key
// dictionary with the overlaps (geospatial bucket) key handler.
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems);
}
532 
// Estimate distinct-tuple cardinality (HLL, 2^b registers) for the overlaps
// key handler, and inclusive-scan the per-element row counts so callers can
// size the one-to-many buffer.
void approximate_distinct_tuples_on_device_overlaps(uint8_t* hll_buffer,
                                                    const uint32_t b,
                                                    int32_t* row_counts_buffer,
                                                    const OverlapsKeyHandler* key_handler,
                                                    const int64_t num_elems) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
                             hll_buffer,
                             row_counts_buffer,
                             b,
                             num_elems,
                             key_handler);

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}
549 
// Estimate distinct-tuple cardinality (HLL, 2^b registers) for the generic
// key handler; no per-row count buffer is needed on this path.
void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
                                           const uint32_t b,
                                           const GenericKeyHandler* key_handler,
                                           const int64_t num_elems) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
                             hll_buffer,
                             nullptr,
                             b,
                             num_elems,
                             key_handler);
}
561 
// Compute per-dimension overlaps-join bucket sizes on the device
// (2 dimensions, per the template argument).
void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
                                    const JoinColumn* join_column,
                                    const JoinColumnTypeInfo* type_info,
                                    const double* bucket_sz_threshold) {
  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
                             bucket_sizes_buffer,
                             join_column,
                             type_info,
                             bucket_sz_threshold);
}
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_hash_join_buff_on_device_sharded(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_baseline_hash_table_on_device(int32_t *buff, const SIZE *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const KEY_HANDLER *key_handler, const size_t num_elems)
void fill_one_to_many_baseline_hash_table_on_device_32(int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
void init_baseline_hash_join_buff_on_device_64(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
#define checkCudaErrors(err)
#define SUFFIX(name)
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void fill_one_to_many_hash_table_on_device_sharded(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void overlaps_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
void fill_hash_join_buff_on_device_bucketized(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
void compute_bucket_sizes_on_device(double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
void init_baseline_hash_join_buff_on_device_32(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void fill_hash_join_buff_on_device(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
#define VALID_POS_FLAG
void fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_one_to_many_hash_table_on_device(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
int64_t bucket_normalization
size_t hash_entry_count
void init_hash_join_buff_on_device(int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
__global__ void init_baseline_hash_join_buff_wrapper(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
DEVICE int SUFFIX() fill_hash_join_buff(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
size_t getNormalizedHashEntryCount() const
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_baseline_hash_join_buff_on_device_32(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
void fill_hash_join_buff_on_device_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_on_device_bucketized(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void approximate_distinct_tuples_on_device_overlaps(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)