OmniSciDB  a987f07e93
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntimeGpu.cu
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "HashJoinRuntime.cpp"
17 
18 #include <cuda.h>
20 
21 #include <thrust/device_ptr.h>
22 #include <thrust/scan.h>
23 
24 #define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
25 
26 template <typename F, typename... ARGS>
27 void cuda_kernel_launch_wrapper(F func, ARGS&&... args) {
28  int grid_size = -1;
29  int block_size = -1;
30  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
31  auto qe_cuda_stream = getQueryEngineCudaStream();
32  func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(args)...);
33  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
34 }
35 
36 __global__ void fill_hash_join_buff_wrapper(int32_t* buff,
37  const int32_t invalid_slot_val,
38  const bool for_semi_join,
39  const JoinColumn join_column,
40  const JoinColumnTypeInfo type_info,
41  int* err) {
42  int partial_err = SUFFIX(fill_hash_join_buff)(
43  buff, invalid_slot_val, for_semi_join, join_column, type_info, NULL, NULL, -1, -1);
44  atomicCAS(err, 0, partial_err);
45 }
46 
48  int32_t* buff,
49  const int32_t invalid_slot_val,
50  const bool for_semi_join,
51  const JoinColumn join_column,
52  const JoinColumnTypeInfo type_info,
53  int* err,
54  const int64_t bucket_normalization) {
55  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
56  invalid_slot_val,
57  for_semi_join,
58  join_column,
59  type_info,
60  NULL,
61  NULL,
62  -1,
63  -1,
64  bucket_normalization);
65  atomicCAS(err, 0, partial_err);
66 }
67 
69  const int32_t invalid_slot_val,
70  const bool for_semi_join,
71  int* dev_err_buff,
72  const JoinColumn join_column,
73  const JoinColumnTypeInfo type_info,
74  const int64_t bucket_normalization) {
76  buff,
77  invalid_slot_val,
78  for_semi_join,
79  join_column,
80  type_info,
81  dev_err_buff,
82  bucket_normalization);
83 }
84 
85 void fill_hash_join_buff_on_device(int32_t* buff,
86  const int32_t invalid_slot_val,
87  const bool for_semi_join,
88  int* dev_err_buff,
89  const JoinColumn join_column,
90  const JoinColumnTypeInfo type_info) {
92  buff,
93  invalid_slot_val,
94  for_semi_join,
95  join_column,
96  type_info,
97  dev_err_buff);
98 }
99 
101  int32_t* buff,
102  const int32_t invalid_slot_val,
103  const bool for_semi_join,
104  const JoinColumn join_column,
105  const JoinColumnTypeInfo type_info,
106  const ShardInfo shard_info,
107  int* err,
108  const int64_t bucket_normalization) {
109  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
110  invalid_slot_val,
111  for_semi_join,
112  join_column,
113  type_info,
114  shard_info,
115  NULL,
116  NULL,
117  -1,
118  -1,
119  bucket_normalization);
120  atomicCAS(err, 0, partial_err);
121 }
122 
123 __global__ void fill_hash_join_buff_wrapper_sharded(int32_t* buff,
124  const int32_t invalid_slot_val,
125  const bool for_semi_join,
126  const JoinColumn join_column,
127  const JoinColumnTypeInfo type_info,
128  const ShardInfo shard_info,
129  int* err) {
130  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(buff,
131  invalid_slot_val,
132  for_semi_join,
133  join_column,
134  type_info,
135  shard_info,
136  NULL,
137  NULL,
138  -1,
139  -1);
140  atomicCAS(err, 0, partial_err);
141 }
142 
144  int32_t* buff,
145  const int32_t invalid_slot_val,
146  const bool for_semi_join,
147  int* dev_err_buff,
148  const JoinColumn join_column,
149  const JoinColumnTypeInfo type_info,
150  const ShardInfo shard_info,
151  const int64_t bucket_normalization) {
153  buff,
154  invalid_slot_val,
155  for_semi_join,
156  join_column,
157  type_info,
158  shard_info,
159  dev_err_buff,
160  bucket_normalization);
161 }
162 
164  const int32_t invalid_slot_val,
165  const bool for_semi_join,
166  int* dev_err_buff,
167  const JoinColumn join_column,
168  const JoinColumnTypeInfo type_info,
169  const ShardInfo shard_info) {
171  buff,
172  invalid_slot_val,
173  for_semi_join,
174  join_column,
175  type_info,
176  shard_info,
177  dev_err_buff);
178 }
179 
180 __global__ void init_hash_join_buff_wrapper(int32_t* buff,
181  const int64_t hash_entry_count,
182  const int32_t invalid_slot_val) {
183  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
184 }
185 
186 void init_hash_join_buff_on_device(int32_t* buff,
187  const int64_t hash_entry_count,
188  const int32_t invalid_slot_val) {
190  init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
191 }
192 
193 #define VALID_POS_FLAG 0
194 
195 __global__ void set_valid_pos_flag(int32_t* pos_buff,
196  const int32_t* count_buff,
197  const int64_t entry_count) {
198  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
199  const int32_t step = blockDim.x * gridDim.x;
200  for (int64_t i = start; i < entry_count; i += step) {
201  if (count_buff[i]) {
202  pos_buff[i] = VALID_POS_FLAG;
203  }
204  }
205 }
206 
207 __global__ void set_valid_pos(int32_t* pos_buff,
208  int32_t* count_buff,
209  const int64_t entry_count) {
210  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
211  const int32_t step = blockDim.x * gridDim.x;
212  for (int64_t i = start; i < entry_count; i += step) {
213  if (VALID_POS_FLAG == pos_buff[i]) {
214  pos_buff[i] = !i ? 0 : count_buff[i - 1];
215  }
216  }
217 }
218 
219 template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
221  const int64_t hash_entry_count,
222  const JoinColumn& join_column,
223  const JoinColumnTypeInfo& type_info,
224  COUNT_MATCHES_FUNCTOR count_matches_func,
225  FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
226  int32_t* pos_buff = buff;
227  int32_t* count_buff = buff + hash_entry_count;
228  auto qe_cuda_stream = getQueryEngineCudaStream();
230  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
231  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
232  count_matches_func();
233 
234  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
235 
236  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
238  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
239 
240  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
242  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
243  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
244  fill_row_ids_func();
245 }
246 
248  const BucketizedHashEntryInfo hash_entry_info,
249  const JoinColumn& join_column,
250  const JoinColumnTypeInfo& type_info,
251  const bool for_window_framing) {
252  auto hash_entry_count = hash_entry_info.bucketized_hash_entry_count;
253  auto count_matches_func = [count_buff = buff + hash_entry_count,
254  join_column,
255  type_info] {
256  cuda_kernel_launch_wrapper(SUFFIX(count_matches), count_buff, join_column, type_info);
257  };
258 
259  auto fill_row_ids_func =
260  [buff, hash_entry_count, join_column, type_info, for_window_framing] {
262  buff,
263  hash_entry_count,
264  join_column,
265  type_info,
266  for_window_framing);
267  };
268 
270  hash_entry_count,
271  join_column,
272  type_info,
273  count_matches_func,
274  fill_row_ids_func);
275 }
276 
278  int32_t* buff,
279  const BucketizedHashEntryInfo hash_entry_info,
280  const JoinColumn& join_column,
281  const JoinColumnTypeInfo& type_info) {
282  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
283  auto count_matches_func = [count_buff = buff + hash_entry_count,
284  join_column,
285  type_info,
286  bucket_normalization =
287  hash_entry_info.bucket_normalization] {
289  count_buff,
290  join_column,
291  type_info,
292  bucket_normalization);
293  };
294 
295  auto fill_row_ids_func = [buff,
296  hash_entry_count =
297  hash_entry_info.getNormalizedHashEntryCount(),
298  join_column,
299  type_info,
300  bucket_normalization = hash_entry_info.bucket_normalization] {
302  buff,
303  hash_entry_count,
304  join_column,
305  type_info,
306  bucket_normalization);
307  };
308 
310  hash_entry_count,
311  join_column,
312  type_info,
313  count_matches_func,
314  fill_row_ids_func);
315 }
316 
318  int32_t* buff,
319  const BucketizedHashEntryInfo hash_entry_info,
320  const JoinColumn& join_column,
321  const JoinColumnTypeInfo& type_info,
322  const ShardInfo& shard_info) {
323  auto hash_entry_count = hash_entry_info.bucketized_hash_entry_count;
324  int32_t* pos_buff = buff;
325  int32_t* count_buff = buff + hash_entry_count;
326  auto qe_cuda_stream = getQueryEngineCudaStream();
328  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
329  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
331  SUFFIX(count_matches_sharded), count_buff, join_column, type_info, shard_info);
332 
333  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
334 
335  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
337  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
338  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
340  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
341  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
343  buff,
344  hash_entry_count,
345  join_column,
346  type_info,
347  shard_info);
348 }
349 
350 template <typename T, typename KEY_HANDLER>
352  const T* composite_key_dict,
353  const int64_t hash_entry_count,
354  const KEY_HANDLER* key_handler,
355  const size_t num_elems,
356  const bool for_window_framing) {
357  auto pos_buff = buff;
358  auto count_buff = buff + hash_entry_count;
359  auto qe_cuda_stream = getQueryEngineCudaStream();
361  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
362  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
363  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
364  count_buff,
365  composite_key_dict,
366  hash_entry_count,
367  key_handler,
368  num_elems);
369 
370  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);
371 
372  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
374  count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
375  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
377  cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
378  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
379 
380  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
381  buff,
382  composite_key_dict,
383  hash_entry_count,
384  key_handler,
385  num_elems,
386  for_window_framing);
387 }
388 
389 template <typename T>
390 __global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
391  const int64_t entry_count,
392  const size_t key_component_count,
393  const bool with_val_slot,
394  const int32_t invalid_slot_val) {
395  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
396  entry_count,
397  key_component_count,
398  with_val_slot,
399  invalid_slot_val,
400  -1,
401  -1);
402 }
403 
404 void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
405  const int64_t entry_count,
406  const size_t key_component_count,
407  const bool with_val_slot,
408  const int32_t invalid_slot_val) {
409  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
410  hash_join_buff,
411  entry_count,
412  key_component_count,
413  with_val_slot,
414  invalid_slot_val);
415 }
416 
417 void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
418  const int64_t entry_count,
419  const size_t key_component_count,
420  const bool with_val_slot,
421  const int32_t invalid_slot_val) {
422  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
423  hash_join_buff,
424  entry_count,
425  key_component_count,
426  with_val_slot,
427  invalid_slot_val);
428 }
429 
430 template <typename T, typename KEY_HANDLER>
431 __global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
432  const int64_t entry_count,
433  const int32_t invalid_slot_val,
434  const bool for_semi_join,
435  const size_t key_component_count,
436  const bool with_val_slot,
437  int* err,
438  const KEY_HANDLER* key_handler,
439  const int64_t num_elems) {
440  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
441  entry_count,
442  invalid_slot_val,
443  for_semi_join,
444  key_component_count,
445  with_val_slot,
446  key_handler,
447  num_elems,
448  -1,
449  -1);
450  atomicCAS(err, 0, partial_err);
451 }
452 
454  const int64_t entry_count,
455  const int32_t invalid_slot_val,
456  const bool for_semi_join,
457  const size_t key_component_count,
458  const bool with_val_slot,
459  int* dev_err_buff,
460  const GenericKeyHandler* key_handler,
461  const int64_t num_elems) {
463  fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
464  hash_buff,
465  entry_count,
466  invalid_slot_val,
467  for_semi_join,
468  key_component_count,
469  with_val_slot,
470  dev_err_buff,
471  key_handler,
472  num_elems);
473 }
474 
476  const int64_t entry_count,
477  const int32_t invalid_slot_val,
478  const bool for_semi_join,
479  const size_t key_component_count,
480  const bool with_val_slot,
481  int* dev_err_buff,
482  const GenericKeyHandler* key_handler,
483  const int64_t num_elems) {
485  fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
486  hash_buff,
487  entry_count,
488  invalid_slot_val,
489  for_semi_join,
490  key_component_count,
491  with_val_slot,
492  dev_err_buff,
493  key_handler,
494  num_elems);
495 }
496 
498  int8_t* hash_buff,
499  const int64_t entry_count,
500  const int32_t invalid_slot_val,
501  const size_t key_component_count,
502  const bool with_val_slot,
503  int* dev_err_buff,
504  const OverlapsKeyHandler* key_handler,
505  const int64_t num_elems) {
507  fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
508  hash_buff,
509  entry_count,
510  invalid_slot_val,
511  false,
512  key_component_count,
513  with_val_slot,
514  dev_err_buff,
515  key_handler,
516  num_elems);
517 }
518 
520  const int64_t entry_count,
521  const int32_t invalid_slot_val,
522  const size_t key_component_count,
523  const bool with_val_slot,
524  int* dev_err_buff,
525  const RangeKeyHandler* key_handler,
526  const size_t num_elems) {
528  fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
529  hash_buff,
530  entry_count,
531  invalid_slot_val,
532  false,
533  key_component_count,
534  with_val_slot,
535  dev_err_buff,
536  key_handler,
537  num_elems);
538 }
539 
541  int32_t* buff,
542  const int32_t* composite_key_dict,
543  const int64_t hash_entry_count,
544  const size_t key_component_count,
545  const GenericKeyHandler* key_handler,
546  const int64_t num_elems,
547  const bool for_window_framing) {
548  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
549  composite_key_dict,
550  hash_entry_count,
551  key_handler,
552  num_elems,
553  for_window_framing);
554 }
555 
557  int32_t* buff,
558  const int64_t* composite_key_dict,
559  const int64_t hash_entry_count,
560  const GenericKeyHandler* key_handler,
561  const int64_t num_elems,
562  const bool for_window_framing) {
563  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
564  composite_key_dict,
565  hash_entry_count,
566  key_handler,
567  num_elems,
568  for_window_framing);
569 }
570 
572  int32_t* buff,
573  const int64_t* composite_key_dict,
574  const int64_t hash_entry_count,
575  const OverlapsKeyHandler* key_handler,
576  const int64_t num_elems) {
577  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
578  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
579 }
580 
582  int32_t* buff,
583  const int64_t* composite_key_dict,
584  const size_t hash_entry_count,
585  const RangeKeyHandler* key_handler,
586  const size_t num_elems) {
587  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
588  buff, composite_key_dict, hash_entry_count, key_handler, num_elems, false);
589 }
590 
592  const uint32_t b,
593  int32_t* row_counts_buffer,
594  const OverlapsKeyHandler* key_handler,
595  const int64_t num_elems) {
596  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
597  hll_buffer,
598  row_counts_buffer,
599  b,
600  num_elems,
601  key_handler);
602 
603  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
605  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
606 }
607 
609  const uint32_t b,
610  int32_t* row_counts_buffer,
611  const RangeKeyHandler* key_handler,
612  const size_t num_elems,
613  const size_t block_size_x,
614  const size_t grid_size_x) {
615  auto qe_cuda_stream = getQueryEngineCudaStream();
616  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
617  hll_buffer, row_counts_buffer, b, num_elems, key_handler);
618  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
619 
620  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
622  row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
623 }
624 
625 void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
626  const uint32_t b,
627  const GenericKeyHandler* key_handler,
628  const int64_t num_elems) {
629  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
630  hll_buffer,
631  nullptr,
632  b,
633  num_elems,
634  key_handler);
635 }
636 
637 void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
638  const JoinColumn* join_column,
639  const JoinColumnTypeInfo* type_info,
640  const double* bucket_sz_threshold) {
641  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
642  bucket_sizes_buffer,
643  join_column,
644  type_info,
645  bucket_sz_threshold);
646 }
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
void fill_hash_join_buff_on_device_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_on_device_sharded(int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
void fill_one_to_many_baseline_hash_table_on_device(int32_t *buff, const SIZE *composite_key_dict, const size_t hash_entry_count, const size_t key_component_count, const KEY_HANDLER *key_handler, const size_t num_elems, const bool for_window_framing)
void * CUstream
Definition: nocuda.h:23
void init_baseline_hash_join_buff_on_device_64(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void fill_baseline_hash_join_buff_on_device_32(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_hash_join_buff_on_device_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
#define SUFFIX(name)
void range_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_bucketized(int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
void overlaps_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
void compute_bucket_sizes_on_device(double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
void init_baseline_hash_join_buff_on_device_32(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void approximate_distinct_tuples_on_device_range(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
#define VALID_POS_FLAG
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
int64_t bucket_normalization
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
void fill_one_to_many_baseline_hash_table_on_device_32(int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems, const bool for_window_framing)
void init_hash_join_buff_on_device(int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
void fill_one_to_many_hash_table_on_device(int32_t *buff, const BucketizedHashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const bool for_window_framing)
DEVICE int SUFFIX() fill_hash_join_buff(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void range_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
__global__ void init_baseline_hash_join_buff_wrapper(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
size_t getNormalizedHashEntryCount() const
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
void fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_hash_join_buff_on_device(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void approximate_distinct_tuples_on_device_overlaps(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
void fill_hash_join_buff_on_device_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
size_t bucketized_hash_entry_count
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)