OmniSciDB  eb3a3d0a03
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntimeGpu.cu
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "HashJoinRuntime.cpp"
17 
18 #include <thrust/device_ptr.h>
19 #include <thrust/scan.h>
20 
21 #define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
22 
/*
 * Launches the kernel `func` with a configuration chosen by the CUDA
 * occupancy API, forwarding `args` unchanged to the kernel.  Launch-time
 * configuration errors are surfaced immediately via cudaGetLastError().
 */
template <typename F, typename... ARGS>
void cuda_kernel_launch_wrapper(F func, ARGS&&... args) {
  int min_grid_size{-1};
  int chosen_block_size{-1};
  // Ask the runtime for a block size that maximizes occupancy for this kernel.
  checkCudaErrors(
      cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &chosen_block_size, func));
  func<<<min_grid_size, chosen_block_size>>>(std::forward<ARGS>(args)...);
  // Kernel launches do not return a status directly; check for launch errors here.
  checkCudaErrors(cudaGetLastError());
}
31 
/*
 * GPU entry point for the one-to-one hash join buffer fill.  Delegates to the
 * shared SUFFIX(fill_hash_join_buff) implementation and publishes the first
 * non-zero error code into *err.
 */
__global__ void fill_hash_join_buff_wrapper(int32_t* buff,
                                            const int32_t invalid_slot_val,
                                            const bool for_semi_join,
                                            const JoinColumn join_column,
                                            const JoinColumnTypeInfo type_info,
                                            int* err) {
  // NULL/NULL, -1/-1: string-dictionary proxies and CPU thread index/count
  // are only meaningful on the CPU code path.
  const int local_err = SUFFIX(fill_hash_join_buff)(buff,
                                                    invalid_slot_val,
                                                    for_semi_join,
                                                    join_column,
                                                    type_info,
                                                    NULL,
                                                    NULL,
                                                    -1,
                                                    -1);
  // First writer wins: only store local_err if *err is still 0.
  atomicCAS(err, 0, local_err);
}
42 
44  int32_t* buff,
45  const int32_t invalid_slot_val,
46  const bool for_semi_join,
47  const JoinColumn join_column,
48  const JoinColumnTypeInfo type_info,
49  int* err,
50  const int64_t bucket_normalization) {
51  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
52  invalid_slot_val,
53  for_semi_join,
54  join_column,
55  type_info,
56  NULL,
57  NULL,
58  -1,
59  -1,
60  bucket_normalization);
61  atomicCAS(err, 0, partial_err);
62 }
63 
65  const int32_t invalid_slot_val,
66  const bool for_semi_join,
67  int* dev_err_buff,
68  const JoinColumn join_column,
69  const JoinColumnTypeInfo type_info,
70  const int64_t bucket_normalization) {
72  buff,
73  invalid_slot_val,
74  for_semi_join,
75  join_column,
76  type_info,
77  dev_err_buff,
78  bucket_normalization);
79 }
80 
81 void fill_hash_join_buff_on_device(int32_t* buff,
82  const int32_t invalid_slot_val,
83  const bool for_semi_join,
84  int* dev_err_buff,
85  const JoinColumn join_column,
86  const JoinColumnTypeInfo type_info) {
88  buff,
89  invalid_slot_val,
90  for_semi_join,
91  join_column,
92  type_info,
93  dev_err_buff);
94 }
95 
97  int32_t* buff,
98  const int32_t invalid_slot_val,
99  const bool for_semi_join,
100  const JoinColumn join_column,
101  const JoinColumnTypeInfo type_info,
102  const ShardInfo shard_info,
103  int* err,
104  const int64_t bucket_normalization) {
105  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
106  invalid_slot_val,
107  for_semi_join,
108  join_column,
109  type_info,
110  shard_info,
111  NULL,
112  NULL,
113  -1,
114  -1,
115  bucket_normalization);
116  atomicCAS(err, 0, partial_err);
117 }
118 
/*
 * GPU entry point for the sharded one-to-one hash join buffer fill.
 * Delegates to SUFFIX(fill_hash_join_buff_sharded) and publishes the first
 * non-zero error code into *err.
 */
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t* buff,
                                                    const int32_t invalid_slot_val,
                                                    const bool for_semi_join,
                                                    const JoinColumn join_column,
                                                    const JoinColumnTypeInfo type_info,
                                                    const ShardInfo shard_info,
                                                    int* err) {
  // NULL dictionary proxies and -1 CPU thread index/count: GPU path only.
  const int slot_err = SUFFIX(fill_hash_join_buff_sharded)(buff,
                                                           invalid_slot_val,
                                                           for_semi_join,
                                                           join_column,
                                                           type_info,
                                                           shard_info,
                                                           NULL,
                                                           NULL,
                                                           -1,
                                                           -1);
  // Record the error only if no earlier error has been stored.
  atomicCAS(err, 0, slot_err);
}
138 
140  int32_t* buff,
141  const int32_t invalid_slot_val,
142  const bool for_semi_join,
143  int* dev_err_buff,
144  const JoinColumn join_column,
145  const JoinColumnTypeInfo type_info,
146  const ShardInfo shard_info,
147  const int64_t bucket_normalization) {
149  buff,
150  invalid_slot_val,
151  for_semi_join,
152  join_column,
153  type_info,
154  shard_info,
155  dev_err_buff,
156  bucket_normalization);
157 }
158 
160  const int32_t invalid_slot_val,
161  const bool for_semi_join,
162  int* dev_err_buff,
163  const JoinColumn join_column,
164  const JoinColumnTypeInfo type_info,
165  const ShardInfo shard_info) {
167  buff,
168  invalid_slot_val,
169  for_semi_join,
170  join_column,
171  type_info,
172  shard_info,
173  dev_err_buff);
174 }
175 
// GPU entry point that initializes a one-to-one hash join buffer, writing
// invalid_slot_val into each of the hash_entry_count slots.
__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int64_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  // -1/-1: cpu_thread_idx and cpu_thread_count are unused on the GPU path.
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}
181 
182 void init_hash_join_buff_on_device(int32_t* buff,
183  const int64_t hash_entry_count,
184  const int32_t invalid_slot_val) {
186  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
187 }
188 
189 #define VALID_POS_FLAG 0
190 
/*
 * Marks pos_buff[i] with VALID_POS_FLAG for every entry whose match count is
 * non-zero, leaving empty entries untouched.  Uses a grid-stride loop so any
 * launch configuration covers all entry_count slots.
 */
__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int64_t entry_count) {
  const int32_t global_tid = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t grid_stride = blockDim.x * gridDim.x;
  for (int64_t entry_idx = global_tid; entry_idx < entry_count;
       entry_idx += grid_stride) {
    if (count_buff[entry_idx]) {
      pos_buff[entry_idx] = VALID_POS_FLAG;
    }
  }
}
202 
/*
 * Converts each slot previously marked VALID_POS_FLAG into its starting
 * offset: 0 for the first entry, otherwise count_buff[i - 1].  Expects
 * count_buff to hold a prefix sum of the match counts at this point — see
 * callers, which run an inclusive scan before launching this kernel.
 */
__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int64_t entry_count) {
  const int32_t global_tid = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t grid_stride = blockDim.x * gridDim.x;
  for (int64_t entry_idx = global_tid; entry_idx < entry_count;
       entry_idx += grid_stride) {
    if (pos_buff[entry_idx] == VALID_POS_FLAG) {
      pos_buff[entry_idx] = (entry_idx == 0) ? 0 : count_buff[entry_idx - 1];
    }
  }
}
214 
215 template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
217  const int64_t hash_entry_count,
218  const int32_t invalid_slot_val,
219  const JoinColumn& join_column,
220  const JoinColumnTypeInfo& type_info,
221  COUNT_MATCHES_FUNCTOR count_matches_func,
222  FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
223  int32_t* pos_buff = buff;
224  int32_t* count_buff = buff + hash_entry_count;
225  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
226  count_matches_func();
227 
228  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
229 
230  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
232  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
233 
234  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
235  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
236  fill_row_ids_func();
237 }
238 
240  const HashEntryInfo hash_entry_info,
241  const int32_t invalid_slot_val,
242  const JoinColumn& join_column,
243  const JoinColumnTypeInfo& type_info) {
244  auto hash_entry_count = hash_entry_info.hash_entry_count;
245  auto count_matches_func = [hash_entry_count,
246  count_buff = buff + hash_entry_count,
247  invalid_slot_val,
248  join_column,
249  type_info] {
251  SUFFIX(count_matches), count_buff, invalid_slot_val, join_column, type_info);
252  };
253 
254  auto fill_row_ids_func =
255  [buff, hash_entry_count, invalid_slot_val, join_column, type_info] {
257  buff,
258  hash_entry_count,
259  invalid_slot_val,
260  join_column,
261  type_info);
262  };
263 
265  hash_entry_count,
266  invalid_slot_val,
267  join_column,
268  type_info,
269  count_matches_func,
270  fill_row_ids_func);
271 }
272 
274  int32_t* buff,
275  const HashEntryInfo hash_entry_info,
276  const int32_t invalid_slot_val,
277  const JoinColumn& join_column,
278  const JoinColumnTypeInfo& type_info) {
279  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
280  auto count_matches_func = [count_buff = buff + hash_entry_count,
281  invalid_slot_val,
282  join_column,
283  type_info,
284  bucket_normalization =
285  hash_entry_info.bucket_normalization] {
287  count_buff,
288  invalid_slot_val,
289  join_column,
290  type_info,
291  bucket_normalization);
292  };
293 
294  auto fill_row_ids_func = [buff,
295  hash_entry_count =
296  hash_entry_info.getNormalizedHashEntryCount(),
297  invalid_slot_val,
298  join_column,
299  type_info,
300  bucket_normalization = hash_entry_info.bucket_normalization] {
302  buff,
303  hash_entry_count,
304  invalid_slot_val,
305  join_column,
306  type_info,
307  bucket_normalization);
308  };
309 
311  hash_entry_count,
312  invalid_slot_val,
313  join_column,
314  type_info,
315  count_matches_func,
316  fill_row_ids_func);
317 }
318 
320  const HashEntryInfo hash_entry_info,
321  const int32_t invalid_slot_val,
322  const JoinColumn& join_column,
323  const JoinColumnTypeInfo& type_info,
324  const ShardInfo& shard_info) {
325  auto hash_entry_count = hash_entry_info.hash_entry_count;
326  int32_t* pos_buff = buff;
327  int32_t* count_buff = buff + hash_entry_count;
328  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
330  count_buff,
331  invalid_slot_val,
332  join_column,
333  type_info,
334  shard_info);
335 
336  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
337 
338  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
340  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
341  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
342  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
344  buff,
345  hash_entry_count,
346  invalid_slot_val,
347  join_column,
348  type_info,
349  shard_info);
350 }
351 
352 template <typename T, typename KEY_HANDLER>
354  const T* composite_key_dict,
355  const int64_t hash_entry_count,
356  const int32_t invalid_slot_val,
357  const KEY_HANDLER* key_handler,
358  const size_t num_elems) {
359  auto pos_buff = buff;
360  auto count_buff = buff + hash_entry_count;
361  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
362  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
363  count_buff,
364  composite_key_dict,
365  hash_entry_count,
366  key_handler,
367  num_elems);
368 
369  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
370 
371  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
373  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
374  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
375  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
376 
377  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
378  buff,
379  composite_key_dict,
380  hash_entry_count,
381  invalid_slot_val,
382  key_handler,
383  num_elems);
384 }
385 
// GPU entry point that initializes a baseline (composite-key) hash join
// buffer of T-sized key components, filling keys (and value slots when
// with_val_slot is set) with invalid_slot_val.
template <typename T>
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                     const int64_t entry_count,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     const int32_t invalid_slot_val) {
  // -1/-1: cpu_thread_idx and cpu_thread_count are unused on the GPU path.
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}
400 
// Host wrapper: initializes a baseline hash join buffer with 32-bit key
// components on the device, using an occupancy-derived launch configuration.
void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
413 
// Host wrapper: initializes a baseline hash join buffer with 64-bit key
// components on the device, using an occupancy-derived launch configuration.
void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
426 
// GPU entry point for the baseline (composite-key) hash join buffer fill.
// T is the key-component width (e.g. int32_t / int64_t); KEY_HANDLER decodes
// keys from the input columns.  The first non-zero error code produced by any
// thread is published into *err.
template <typename T, typename KEY_HANDLER>
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
                                                     const int64_t entry_count,
                                                     const int32_t invalid_slot_val,
                                                     const bool for_semi_join,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     int* err,
                                                     const KEY_HANDLER* key_handler,
                                                     const int64_t num_elems) {
  // -1/-1: cpu_thread_idx and cpu_thread_count are unused on the GPU path.
  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                            entry_count,
                                                            invalid_slot_val,
                                                            for_semi_join,
                                                            key_component_count,
                                                            with_val_slot,
                                                            key_handler,
                                                            num_elems,
                                                            -1,
                                                            -1);
  // First writer wins: only store partial_err if *err is still 0.
  atomicCAS(err, 0, partial_err);
}
449 
451  const int64_t entry_count,
452  const int32_t invalid_slot_val,
453  const bool for_semi_join,
454  const size_t key_component_count,
455  const bool with_val_slot,
456  int* dev_err_buff,
457  const GenericKeyHandler* key_handler,
458  const int64_t num_elems) {
460  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
461  hash_buff,
462  entry_count,
463  invalid_slot_val,
464  for_semi_join,
465  key_component_count,
466  with_val_slot,
467  dev_err_buff,
468  key_handler,
469  num_elems);
470 }
471 
473  const int64_t entry_count,
474  const int32_t invalid_slot_val,
475  const bool for_semi_join,
476  const size_t key_component_count,
477  const bool with_val_slot,
478  int* dev_err_buff,
479  const GenericKeyHandler* key_handler,
480  const int64_t num_elems) {
482  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
483  hash_buff,
484  entry_count,
485  invalid_slot_val,
486  for_semi_join,
487  key_component_count,
488  with_val_slot,
489  dev_err_buff,
490  key_handler,
491  num_elems);
492 }
493 
495  int8_t* hash_buff,
496  const int64_t entry_count,
497  const int32_t invalid_slot_val,
498  const size_t key_component_count,
499  const bool with_val_slot,
500  int* dev_err_buff,
501  const OverlapsKeyHandler* key_handler,
502  const int64_t num_elems) {
504  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
505  hash_buff,
506  entry_count,
507  invalid_slot_val,
508  false,
509  key_component_count,
510  with_val_slot,
511  dev_err_buff,
512  key_handler,
513  num_elems);
514 }
515 
517  const int64_t entry_count,
518  const int32_t invalid_slot_val,
519  const size_t key_component_count,
520  const bool with_val_slot,
521  int* dev_err_buff,
522  const RangeKeyHandler* key_handler,
523  const size_t num_elems) {
525  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
526  hash_buff,
527  entry_count,
528  invalid_slot_val,
529  false,
530  key_component_count,
531  with_val_slot,
532  dev_err_buff,
533  key_handler,
534  num_elems);
535 }
536 
538  int32_t* buff,
539  const int32_t* composite_key_dict,
540  const int64_t hash_entry_count,
541  const int32_t invalid_slot_val,
542  const size_t key_component_count,
543  const GenericKeyHandler* key_handler,
544  const int64_t num_elems) {
545  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
546  composite_key_dict,
547  hash_entry_count,
548  invalid_slot_val,
549  key_handler,
550  num_elems);
551 }
552 
554  int32_t* buff,
555  const int64_t* composite_key_dict,
556  const int64_t hash_entry_count,
557  const int32_t invalid_slot_val,
558  const GenericKeyHandler* key_handler,
559  const int64_t num_elems) {
560  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
561  composite_key_dict,
562  hash_entry_count,
563  invalid_slot_val,
564  key_handler,
565  num_elems);
566 }
567 
569  int32_t* buff,
570  const int64_t* composite_key_dict,
571  const int64_t hash_entry_count,
572  const int32_t invalid_slot_val,
573  const OverlapsKeyHandler* key_handler,
574  const int64_t num_elems) {
575  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
576  composite_key_dict,
577  hash_entry_count,
578  invalid_slot_val,
579  key_handler,
580  num_elems);
581 }
582 
584  int32_t* buff,
585  const int64_t* composite_key_dict,
586  const size_t hash_entry_count,
587  const int32_t invalid_slot_val,
588  const RangeKeyHandler* key_handler,
589  const size_t num_elems) {
590  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
591  composite_key_dict,
592  hash_entry_count,
593  invalid_slot_val,
594  key_handler,
595  num_elems);
596 }
597 
599  const uint32_t b,
600  int32_t* row_counts_buffer,
601  const OverlapsKeyHandler* key_handler,
602  const int64_t num_elems) {
603  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
604  hll_buffer,
605  row_counts_buffer,
606  b,
607  num_elems,
608  key_handler);
609 
610  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
612  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
613 }
614 
616  const uint32_t b,
617  int32_t* row_counts_buffer,
618  const RangeKeyHandler* key_handler,
619  const size_t num_elems,
620  const size_t block_size_x,
621  const size_t grid_size_x) {
622  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x>>>(
623  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
624 
625  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
627  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
628 }
629 
// Host wrapper: updates the HyperLogLog registers in hll_buffer (2^b buckets)
// from the tuples decoded by key_handler.  Unlike the overlaps/range variants,
// no per-row count buffer is maintained here (nullptr is passed instead).
void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
                                           const uint32_t b,
                                           const GenericKeyHandler* key_handler,
                                           const int64_t num_elems) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
                             hll_buffer,
                             nullptr,
                             b,
                             num_elems,
                             key_handler);
}
641 
// Host wrapper: computes per-dimension bucket sizes for the join column on
// the device, writing the results into bucket_sizes_buffer.
// NOTE(review): the <2> template argument presumably fixes the number of
// bucket dimensions at two — confirm against compute_bucket_sizes_impl_gpu.
void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
                                    const JoinColumn* join_column,
                                    const JoinColumnTypeInfo* type_info,
                                    const double* bucket_sz_threshold) {
  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
                             bucket_sizes_buffer,
                             join_column,
                             type_info,
                             bucket_sz_threshold);
}
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
void fill_hash_join_buff_on_device_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
void fill_one_to_many_baseline_hash_table_on_device(int32_t *buff, const SIZE *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const KEY_HANDLER *key_handler, const size_t num_elems)
void fill_one_to_many_baseline_hash_table_on_device_32(int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
void init_baseline_hash_join_buff_on_device_64(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
void fill_baseline_hash_join_buff_on_device_32(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_hash_join_buff_on_device_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
#define checkCudaErrors(err)
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
#define SUFFIX(name)
void range_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_sharded(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
void overlaps_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
void compute_bucket_sizes_on_device(double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
void init_baseline_hash_join_buff_on_device_32(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void approximate_distinct_tuples_on_device_range(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
#define VALID_POS_FLAG
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
void fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const int32_t invalid_slot_val, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_one_to_many_hash_table_on_device(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
int64_t bucket_normalization
size_t hash_entry_count
void init_hash_join_buff_on_device(int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void range_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const RangeKeyHandler *key_handler, const size_t num_elems)
__global__ void init_baseline_hash_join_buff_wrapper(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
size_t getNormalizedHashEntryCount() const
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_hash_join_buff_on_device(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
void fill_one_to_many_hash_table_on_device_bucketized(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
void approximate_distinct_tuples_on_device_overlaps(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
void fill_hash_join_buff_on_device_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
DEVICE int SUFFIX() fill_hash_join_buff(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)