OmniSciDB  c0231cc57d
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoinRuntimeGpu.cu
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "HashJoinRuntime.cpp"
17 
18 #include <cuda.h>
20 
21 #include <thrust/device_ptr.h>
22 #include <thrust/scan.h>
23 
24 #define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
25 
// Launches `func` on the query-engine CUDA stream with an occupancy-derived
// launch configuration and blocks until the kernel completes.
//
// func:  a __global__ kernel; args are perfectly forwarded to the launch.
// Fails via checkCudaErrors (CHECK_EQ against cudaSuccess) on any CUDA error.
template <typename F, typename... ARGS>
void cuda_kernel_launch_wrapper(F func, ARGS&&... args) {
  int grid_size = -1;
  int block_size = -1;
  // Let the runtime pick a block/grid size that maximizes occupancy for `func`.
  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
  auto qe_cuda_stream = getQueryEngineCudaStream();
  func<<<grid_size, block_size, 0, qe_cuda_stream>>>(std::forward<ARGS>(args)...);
  // Launch-configuration errors are only reported via cudaGetLastError(); check
  // it explicitly rather than relying on the synchronize below to surface them.
  checkCudaErrors(cudaGetLastError());
  // Synchronize so in-kernel execution errors are caught here, at the call site.
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
}
35 
// Kernel shim over the shared fill_hash_join_buff implementation (compiled in
// via HashJoinRuntime.cpp). The trailing NULL/-1 arguments are the
// sd_inner_to_outer_translation_map, min_inner_elem and CPU thread
// index/count slots, which the GPU path does not use.
// atomicCAS(err, 0, ...) records only the first non-zero status.
__global__ void fill_hash_join_buff_wrapper(int32_t* buff,
                                            const int32_t invalid_slot_val,
                                            const bool for_semi_join,
                                            const JoinColumn join_column,
                                            const JoinColumnTypeInfo type_info,
                                            int* err) {
  const int status = SUFFIX(fill_hash_join_buff)(
      buff, invalid_slot_val, for_semi_join, join_column, type_info, NULL, NULL, -1, -1);
  atomicCAS(err, 0, status);
}
46 
// Kernel shim over the shared fill_hash_join_buff_bucketized implementation.
// NULL/-1 trailing arguments disable the translation-map / CPU-thread
// parameters, which are unused on the GPU path. Only the first non-zero
// status is kept in *err (atomicCAS leaves *err untouched once set).
// NOTE(review): the signature line was dropped by the doc extraction; it is
// reconstructed from this file's symbol index.
__global__ void fill_hash_join_buff_bucketized_wrapper(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const bool for_semi_join,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
                                                           invalid_slot_val,
                                                           for_semi_join,
                                                           join_column,
                                                           type_info,
                                                           NULL,
                                                           NULL,
                                                           -1,
                                                           -1,
                                                           bucket_normalization);
  atomicCAS(err, 0, partial_err);
}
67 
69  const int32_t invalid_slot_val,
70  const bool for_semi_join,
71  int* dev_err_buff,
72  const JoinColumn join_column,
73  const JoinColumnTypeInfo type_info,
74  const int64_t bucket_normalization) {
76  buff,
77  invalid_slot_val,
78  for_semi_join,
79  join_column,
80  type_info,
81  dev_err_buff,
82  bucket_normalization);
83 }
84 
// Host entry point: launches fill_hash_join_buff_wrapper on the query-engine
// stream (synchronously). Kernel errors are reported through *dev_err_buff.
// NOTE(review): the launch head line was dropped by the doc extraction;
// reconstructed to call the matching wrapper kernel defined above.
void fill_hash_join_buff_on_device(int32_t* buff,
                                   const int32_t invalid_slot_val,
                                   const bool for_semi_join,
                                   int* dev_err_buff,
                                   const JoinColumn join_column,
                                   const JoinColumnTypeInfo type_info) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
                             buff,
                             invalid_slot_val,
                             for_semi_join,
                             join_column,
                             type_info,
                             dev_err_buff);
}
99 
// Kernel shim over the shared fill_hash_join_buff_sharded_bucketized
// implementation. NULL/-1 trailing arguments disable the translation-map /
// CPU-thread parameters. Only the first non-zero status is kept in *err.
// NOTE(review): the signature line was dropped by the doc extraction;
// reconstructed from this file's symbol index.
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const bool for_semi_join,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
                                                                   invalid_slot_val,
                                                                   for_semi_join,
                                                                   join_column,
                                                                   type_info,
                                                                   shard_info,
                                                                   NULL,
                                                                   NULL,
                                                                   -1,
                                                                   -1,
                                                                   bucket_normalization);
  atomicCAS(err, 0, partial_err);
}
122 
// Kernel shim over the shared fill_hash_join_buff_sharded implementation.
// The NULL/-1 trailing arguments stand in for the translation map and CPU
// thread index/count, which the GPU path does not use. atomicCAS records
// only the first non-zero status in *err.
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t* buff,
                                                    const int32_t invalid_slot_val,
                                                    const bool for_semi_join,
                                                    const JoinColumn join_column,
                                                    const JoinColumnTypeInfo type_info,
                                                    const ShardInfo shard_info,
                                                    int* err) {
  const int status = SUFFIX(fill_hash_join_buff_sharded)(buff,
                                                         invalid_slot_val,
                                                         for_semi_join,
                                                         join_column,
                                                         type_info,
                                                         shard_info,
                                                         NULL,
                                                         NULL,
                                                         -1,
                                                         -1);
  atomicCAS(err, 0, status);
}
142 
// Host entry point: launches fill_hash_join_buff_wrapper_sharded_bucketized
// on the query-engine stream (synchronously). Kernel errors are reported
// through *dev_err_buff.
// NOTE(review): the signature and launch-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void fill_hash_join_buff_on_device_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const bool for_semi_join,
    int* dev_err_buff,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    const int64_t bucket_normalization) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
                             buff,
                             invalid_slot_val,
                             for_semi_join,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff,
                             bucket_normalization);
}
162 
// Host entry point: launches fill_hash_join_buff_wrapper_sharded on the
// query-engine stream (synchronously). Kernel errors are reported through
// *dev_err_buff.
// NOTE(review): the signature and launch-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void fill_hash_join_buff_on_device_sharded(int32_t* buff,
                                           const int32_t invalid_slot_val,
                                           const bool for_semi_join,
                                           int* dev_err_buff,
                                           const JoinColumn join_column,
                                           const JoinColumnTypeInfo type_info,
                                           const ShardInfo shard_info) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
                             buff,
                             invalid_slot_val,
                             for_semi_join,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff);
}
179 
// Kernel shim: forwards to the shared init_hash_join_buff implementation.
// The trailing -1, -1 are the CPU thread index/count, unused on the GPU path.
__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int64_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}
185 
// Host entry point: fills the hash-join buffer with invalid_slot_val by
// launching init_hash_join_buff_wrapper (synchronously).
// NOTE(review): the launch head line was dropped by the doc extraction;
// reconstructed to call cuda_kernel_launch_wrapper like its siblings.
void init_hash_join_buff_on_device(int32_t* buff,
                                   const int64_t hash_entry_count,
                                   const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(
      init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
}
192 
193 #define VALID_POS_FLAG 0
194 
// Marks pos_buff[i] with VALID_POS_FLAG for every slot whose match count is
// non-zero; slots with zero matches are left untouched. Uses a grid-stride
// loop, so any launch configuration covers all entry_count slots.
__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int64_t entry_count) {
  const int32_t first = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t stride = gridDim.x * blockDim.x;
  for (int64_t idx = first; idx < entry_count; idx += stride) {
    if (count_buff[idx] != 0) {
      pos_buff[idx] = VALID_POS_FLAG;
    }
  }
}
206 
// Converts each slot previously flagged by set_valid_pos_flag into its start
// offset: count_buff holds an inclusive scan, so the previous slot's value is
// this slot's exclusive prefix (0 for slot 0). Grid-stride loop.
__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int64_t entry_count) {
  const int32_t first = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t stride = gridDim.x * blockDim.x;
  for (int64_t idx = first; idx < entry_count; idx += stride) {
    if (pos_buff[idx] == VALID_POS_FLAG) {
      pos_buff[idx] = (idx == 0) ? 0 : count_buff[idx - 1];
    }
  }
}
218 
// Shared driver for building a one-to-many hash table on the GPU:
//   1. zero the count buffer and run the caller-supplied count kernel,
//   2. flag non-empty slots, inclusive-scan the counts (thrust),
//   3. turn flags into start offsets, re-zero counts,
//   4. run the caller-supplied fill-row-ids kernel.
// The buffer layout is [pos | count | payload]: pos_buff is `buff`, the
// count buffer immediately follows at buff + hash_entry_count.
// NOTE(review): the signature and several call-head lines were dropped by the
// doc extraction; reconstructed from this file's symbol index.
template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
void fill_one_to_many_hash_table_on_device_impl(int32_t* buff,
                                                const int64_t hash_entry_count,
                                                const JoinColumn& join_column,
                                                const JoinColumnTypeInfo& type_info,
                                                COUNT_MATCHES_FUNCTOR count_matches_func,
                                                FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  auto qe_cuda_stream = getQueryEngineCudaStream();
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  count_matches_func();

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  // In-place inclusive scan of per-slot match counts -> end offsets.
  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);

  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  // Re-zero so the fill kernel can reuse count_buff as per-slot write cursors.
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  fill_row_ids_func();
}
246 
// Builds a one-to-many hash table for a plain (non-bucketized, non-sharded)
// join column by delegating to the shared impl with the count_matches /
// fill_row_ids kernels.
// NOTE(review): the signature and call-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void fill_one_to_many_hash_table_on_device(int32_t* buff,
                                           const HashEntryInfo hash_entry_info,
                                           const JoinColumn& join_column,
                                           const JoinColumnTypeInfo& type_info) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  auto count_matches_func = [count_buff = buff + hash_entry_count,
                             join_column,
                             type_info] {
    cuda_kernel_launch_wrapper(SUFFIX(count_matches), count_buff, join_column, type_info);
  };

  auto fill_row_ids_func = [buff, hash_entry_count, join_column, type_info] {
    cuda_kernel_launch_wrapper(
        SUFFIX(fill_row_ids), buff, hash_entry_count, join_column, type_info);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             join_column,
                                             type_info,
                                             count_matches_func,
                                             fill_row_ids_func);
}
270 
// Builds a one-to-many hash table for a bucketized join column: slot indices
// are normalized by hash_entry_info.bucket_normalization, and the entry count
// is the normalized one.
// NOTE(review): the signature and kernel-launch head lines were dropped by
// the doc extraction; reconstructed from this file's symbol index.
void fill_one_to_many_hash_table_on_device_bucketized(
    int32_t* buff,
    const HashEntryInfo hash_entry_info,
    const JoinColumn& join_column,
    const JoinColumnTypeInfo& type_info) {
  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
  auto count_matches_func = [count_buff = buff + hash_entry_count,
                             join_column,
                             type_info,
                             bucket_normalization =
                                 hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
                               count_buff,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  auto fill_row_ids_func = [buff,
                            hash_entry_count =
                                hash_entry_info.getNormalizedHashEntryCount(),
                            join_column,
                            type_info,
                            bucket_normalization = hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
                               buff,
                               hash_entry_count,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             join_column,
                                             type_info,
                                             count_matches_func,
                                             fill_row_ids_func);
}
310 
// Builds a one-to-many hash table for a sharded join column. Same
// count -> flag -> scan -> offset -> fill pipeline as
// fill_one_to_many_hash_table_on_device_impl, but inlined here because the
// sharded count/fill kernels take the extra ShardInfo argument.
// NOTE(review): the signature and several call-head lines were dropped by the
// doc extraction; reconstructed from this file's symbol index.
void fill_one_to_many_hash_table_on_device_sharded(int32_t* buff,
                                                   const HashEntryInfo hash_entry_info,
                                                   const JoinColumn& join_column,
                                                   const JoinColumnTypeInfo& type_info,
                                                   const ShardInfo& shard_info) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  auto qe_cuda_stream = getQueryEngineCudaStream();
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  cuda_kernel_launch_wrapper(
      SUFFIX(count_matches_sharded), count_buff, join_column, type_info, shard_info);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  // Re-zero so the fill kernel can reuse count_buff as per-slot write cursors.
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
                             buff,
                             hash_entry_count,
                             join_column,
                             type_info,
                             shard_info);
}
342 
// Builds the one-to-many payload for a baseline (composite-key) hash table:
// same count -> flag -> scan -> offset -> fill pipeline as the simple path,
// but keys are resolved through composite_key_dict by the templated
// count/fill baseline kernels. T is the key component width (int32_t or
// int64_t as instantiated by the callers below).
// NOTE(review): the signature and call-head lines were dropped by the doc
// extraction; reconstructed from the callers in this file.
template <typename T, typename KEY_HANDLER>
void fill_one_to_many_baseline_hash_table_on_device(int32_t* buff,
                                                    const T* composite_key_dict,
                                                    const int64_t hash_entry_count,
                                                    const KEY_HANDLER* key_handler,
                                                    const size_t num_elems) {
  auto pos_buff = buff;
  auto count_buff = buff + hash_entry_count;
  auto qe_cuda_stream = getQueryEngineCudaStream();
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
                             count_buff,
                             composite_key_dict,
                             hash_entry_count,
                             key_handler,
                             num_elems);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  // Re-zero so the fill kernel can reuse count_buff as per-slot write cursors.
  checkCudaErrors(
      cudaMemsetAsync(count_buff, 0, hash_entry_count * sizeof(int32_t), qe_cuda_stream));
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));

  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
                             buff,
                             composite_key_dict,
                             hash_entry_count,
                             key_handler,
                             num_elems);
}
379 
// Kernel shim: forwards to the shared init_baseline_hash_join_buff<T>
// implementation. The trailing -1, -1 are the CPU thread index/count,
// unused on the GPU path. T is the key component width (int32_t/int64_t).
template <typename T>
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                     const int64_t entry_count,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     const int32_t invalid_slot_val) {
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}
394 
// Host entry point: initializes a baseline hash-join buffer with 32-bit key
// components by launching init_baseline_hash_join_buff_wrapper<int32_t>
// (synchronous launch via cuda_kernel_launch_wrapper).
void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
407 
// Host entry point: initializes a baseline hash-join buffer with 64-bit key
// components by launching init_baseline_hash_join_buff_wrapper<int64_t>
// (synchronous launch via cuda_kernel_launch_wrapper).
void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}
420 
// Kernel shim over the shared fill_baseline_hash_join_buff<T> implementation.
// The trailing -1, -1 are the CPU thread index/count, unused on the GPU path.
// atomicCAS records only the first non-zero status in *err.
template <typename T, typename KEY_HANDLER>
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
                                                     const int64_t entry_count,
                                                     const int32_t invalid_slot_val,
                                                     const bool for_semi_join,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     int* err,
                                                     const KEY_HANDLER* key_handler,
                                                     const int64_t num_elems) {
  const int status = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                             entry_count,
                                                             invalid_slot_val,
                                                             for_semi_join,
                                                             key_component_count,
                                                             with_val_slot,
                                                             key_handler,
                                                             num_elems,
                                                             -1,
                                                             -1);
  atomicCAS(err, 0, status);
}
443 
// Host entry point: fills a baseline hash-join buffer with 32-bit key
// components for generic keys (synchronous launch). Kernel errors are
// reported through *dev_err_buff.
// NOTE(review): the signature and launch-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void fill_baseline_hash_join_buff_on_device_32(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const bool for_semi_join,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      for_semi_join,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
465 
// Host entry point: fills a baseline hash-join buffer with 64-bit key
// components for generic keys (synchronous launch). Kernel errors are
// reported through *dev_err_buff. The unsigned long long instantiation
// matches the 64-bit atomicCAS overload used inside the shared fill code.
// NOTE(review): the signature and launch-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const bool for_semi_join,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      for_semi_join,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
487 
// Host entry point: fills a 64-bit baseline hash-join buffer for overlaps
// (range-bucketed geospatial) keys. for_semi_join is hard-coded to false for
// this join type. Kernel errors are reported through *dev_err_buff.
// NOTE(review): the signature and launch-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void overlaps_fill_baseline_hash_join_buff_on_device_64(
    int8_t* hash_buff,
    const int64_t entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const bool with_val_slot,
    int* dev_err_buff,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      false,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
509 
// Host entry point: fills a 64-bit baseline hash-join buffer for range-join
// keys. for_semi_join is hard-coded to false for this join type. Kernel
// errors are reported through *dev_err_buff.
// NOTE(review): the signature and launch-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void range_fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                                     const int64_t entry_count,
                                                     const int32_t invalid_slot_val,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     int* dev_err_buff,
                                                     const RangeKeyHandler* key_handler,
                                                     const size_t num_elems) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, RangeKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      false,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}
530 
// 32-bit instantiation of the one-to-many baseline fill. Note:
// key_component_count is accepted for interface parity but not needed by the
// device-side template, which derives layout from the key handler.
// NOTE(review): the signature line was dropped by the doc extraction;
// reconstructed from this file's symbol index.
void fill_one_to_many_baseline_hash_table_on_device_32(
    int32_t* buff,
    const int32_t* composite_key_dict,
    const int64_t hash_entry_count,
    const size_t key_component_count,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int32_t>(
      buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
}
541 
// 64-bit instantiation of the one-to-many baseline fill for generic keys.
// NOTE(review): the signature line was dropped by the doc extraction;
// reconstructed from this file's symbol index.
void fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
      buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
}
551 
// 64-bit instantiation of the one-to-many baseline fill for overlaps keys.
// NOTE(review): the signature line was dropped by the doc extraction;
// reconstructed from this file's symbol index.
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
      buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
}
561 
// 64-bit instantiation of the one-to-many baseline fill for range-join keys.
// NOTE(review): the signature line was dropped by the doc extraction;
// reconstructed from this file's symbol index.
void range_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const size_t hash_entry_count,
    const RangeKeyHandler* key_handler,
    const size_t num_elems) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(
      buff, composite_key_dict, hash_entry_count, key_handler, num_elems);
}
571 
// Populates hll_buffer (presumably HyperLogLog registers, with `b` register
// bits — TODO confirm against approximate_distinct_tuples_impl_gpu) and a
// per-element row-count buffer for overlaps keys, then converts the row
// counts to a prefix sum in place with thrust.
// NOTE(review): the signature and scan-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void approximate_distinct_tuples_on_device_overlaps(uint8_t* hll_buffer,
                                                    const uint32_t b,
                                                    int32_t* row_counts_buffer,
                                                    const OverlapsKeyHandler* key_handler,
                                                    const int64_t num_elems) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
                             hll_buffer,
                             row_counts_buffer,
                             b,
                             num_elems,
                             key_handler);

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}
588 
// Range-join variant of the approximate-distinct estimation. Unlike the
// other entry points it takes an explicit block/grid size instead of using
// the occupancy-based cuda_kernel_launch_wrapper, so it launches and
// synchronizes manually on the query-engine stream, then prefix-sums the row
// counts in place.
// NOTE(review): the signature and scan-head lines were dropped by the doc
// extraction; reconstructed from this file's symbol index.
void approximate_distinct_tuples_on_device_range(uint8_t* hll_buffer,
                                                 const uint32_t b,
                                                 int32_t* row_counts_buffer,
                                                 const RangeKeyHandler* key_handler,
                                                 const size_t num_elems,
                                                 const size_t block_size_x,
                                                 const size_t grid_size_x) {
  auto qe_cuda_stream = getQueryEngineCudaStream();
  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
      hll_buffer, row_counts_buffer, b, num_elems, key_handler);
  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}
605 
// Populates hll_buffer (presumably HyperLogLog registers, with `b` register
// bits — TODO confirm against approximate_distinct_tuples_impl_gpu) for
// generic keys. The nullptr disables the per-element row-count output used
// by the overlaps/range variants.
void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
                                           const uint32_t b,
                                           const GenericKeyHandler* key_handler,
                                           const int64_t num_elems) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
                             hll_buffer,
                             nullptr,
                             b,
                             num_elems,
                             key_handler);
}
617 
// Computes bucket sizes for the join column on the GPU, writing the result
// into bucket_sizes_buffer. The <2> template argument is a compile-time
// dimension count of compute_bucket_sizes_impl_gpu — presumably two bucket
// dimensions (e.g. x/y); verify against the kernel's definition.
void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
                                    const JoinColumn* join_column,
                                    const JoinColumnTypeInfo* type_info,
                                    const double* bucket_sz_threshold) {
  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
                             bucket_sizes_buffer,
                             join_column,
                             type_info,
                             bucket_sz_threshold);
}
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const int64_t num_elems)
void fill_hash_join_buff_on_device_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int64_t bucket_normalization)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_one_to_many_hash_table_on_device(int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
void fill_one_to_many_baseline_hash_table_on_device(int32_t *buff, const SIZE *composite_key_dict, const size_t hash_entry_count, const size_t key_component_count, const KEY_HANDLER *key_handler, const size_t num_elems)
void * CUstream
Definition: nocuda.h:23
void init_baseline_hash_join_buff_on_device_64(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void fill_baseline_hash_join_buff_on_device_32(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
void fill_hash_join_buff_on_device_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info)
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
#define SUFFIX(name)
void range_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const RangeKeyHandler *key_handler, const size_t num_elems)
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void overlaps_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
void fill_one_to_many_baseline_hash_table_on_device_32(int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const GenericKeyHandler *key_handler, const int64_t num_elems)
void compute_bucket_sizes_on_device(double *bucket_sizes_buffer, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds)
void init_baseline_hash_join_buff_on_device_32(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void approximate_distinct_tuples_on_device_range(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const RangeKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
#define VALID_POS_FLAG
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
int64_t bucket_normalization
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_bucketized(int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info)
size_t hash_entry_count
void init_hash_join_buff_on_device(int32_t *buff, const int64_t entry_count, const int32_t invalid_slot_val)
void fill_one_to_many_hash_table_on_device_sharded(int32_t *buff, const HashEntryInfo hash_entry_info, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info)
DEVICE int SUFFIX() fill_hash_join_buff(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void range_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const RangeKeyHandler *key_handler, const size_t num_elems)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
__global__ void init_baseline_hash_join_buff_wrapper(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int64_t entry_count)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void cuda_kernel_launch_wrapper(F func, ARGS &&...args)
size_t getNormalizedHashEntryCount() const
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const int64_t num_elems)
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
void fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const int64_t num_elems)
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_hash_join_buff_on_device(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void approximate_distinct_tuples_on_device_overlaps(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const int64_t num_elems)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int64_t hash_entry_count, const int32_t invalid_slot_val)
void fill_hash_join_buff_on_device_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int64_t bucket_normalization)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int64_t entry_count)