OmniSciDB  dfae7c3b14
HashJoinRuntimeGpu.cu
/*
 * Copyright 2017 MapD Technologies, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
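// The shared hash join runtime is included as a source file so that its fill/init
// routines are compiled as device code for the kernels defined below.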
#include "HashJoinRuntime.cpp"

#include <thrust/device_ptr.h>
#include <thrust/scan.h>

#define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)

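// Launches `func` with a grid/block configuration chosen by the CUDA occupancy API
// (cudaOccupancyMaxPotentialBlockSize) and checks for launch errors afterwards.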
template <typename F, typename... ARGS>
void cuda_kernel_launch_wrapper(F func, ARGS&&... args) {
  int grid_size = -1;
  int block_size = -1;
  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, func));
  func<<<grid_size, block_size>>>(std::forward<ARGS>(args)...);
  checkCudaErrors(cudaGetLastError());
}

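// Device-side wrappers around the shared one-to-one fill routines. The trailing
// NULL/-1 arguments correspond to CPU-only parameters (string dictionary proxies and
// CPU thread bookkeeping) that the GPU path does not use. atomicCAS keeps only the
// first non-zero error code reported by any thread. The *_bucketized variants divide
// each join key by bucket_normalization when computing its slot.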
__global__ void fill_hash_join_buff_wrapper(int32_t* buff,
                                            const int32_t invalid_slot_val,
                                            const JoinColumn join_column,
                                            const JoinColumnTypeInfo type_info,
                                            int* err) {
  int partial_err = SUFFIX(fill_hash_join_buff)(
      buff, invalid_slot_val, join_column, type_info, NULL, NULL, -1, -1);
  atomicCAS(err, 0, partial_err);
}

__global__ void fill_hash_join_buff_bucketized_wrapper(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_bucketized)(buff,
                                                           invalid_slot_val,
                                                           join_column,
                                                           type_info,
                                                           NULL,
                                                           NULL,
                                                           -1,
                                                           -1,
                                                           bucket_normalization);
  atomicCAS(err, 0, partial_err);
}

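// Host entry points for filling a one-to-one (perfect) hash join buffer on the
// device. The block_size_x/grid_size_x arguments are kept for interface
// compatibility with the CPU path but are ignored here; the launch configuration is
// chosen by cuda_kernel_launch_wrapper.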
void fill_hash_join_buff_on_device_bucketized(int32_t* buff,
                                              const int32_t invalid_slot_val,
                                              int* dev_err_buff,
                                              const JoinColumn join_column,
                                              const JoinColumnTypeInfo type_info,
                                              const size_t block_size_x,
                                              const size_t grid_size_x,
                                              const int64_t bucket_normalization) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_bucketized_wrapper,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             dev_err_buff,
                             bucket_normalization);
}

void fill_hash_join_buff_on_device(int32_t* buff,
                                   const int32_t invalid_slot_val,
                                   int* dev_err_buff,
                                   const JoinColumn join_column,
                                   const JoinColumnTypeInfo type_info,
                                   const size_t block_size_x,
                                   const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             dev_err_buff);
}

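// Sharded variants: with sharded tables each device owns a subset of the shards, so
// the fill routines receive a ShardInfo describing the shard layout and only insert
// rows that belong to this device's shards.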
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    int* err,
    const int64_t bucket_normalization) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
                                                                   invalid_slot_val,
                                                                   join_column,
                                                                   type_info,
                                                                   shard_info,
                                                                   NULL,
                                                                   NULL,
                                                                   -1,
                                                                   -1,
                                                                   bucket_normalization);
  atomicCAS(err, 0, partial_err);
}

__global__ void fill_hash_join_buff_wrapper_sharded(int32_t* buff,
                                                    const int32_t invalid_slot_val,
                                                    const JoinColumn join_column,
                                                    const JoinColumnTypeInfo type_info,
                                                    const ShardInfo shard_info,
                                                    int* err) {
  int partial_err = SUFFIX(fill_hash_join_buff_sharded)(
      buff, invalid_slot_val, join_column, type_info, shard_info, NULL, NULL, -1, -1);
  atomicCAS(err, 0, partial_err);
}

void fill_hash_join_buff_on_device_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    int* dev_err_buff,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    const size_t block_size_x,
    const size_t grid_size_x,
    const int64_t bucket_normalization) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded_bucketized,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff,
                             bucket_normalization);
}

void fill_hash_join_buff_on_device_sharded(int32_t* buff,
                                           const int32_t invalid_slot_val,
                                           int* dev_err_buff,
                                           const JoinColumn join_column,
                                           const JoinColumnTypeInfo type_info,
                                           const ShardInfo shard_info,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(fill_hash_join_buff_wrapper_sharded,
                             buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info,
                             dev_err_buff);
}

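// Initializes every slot of a one-to-one hash join buffer to invalid_slot_val.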
__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int64_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}

void init_hash_join_buff_on_device(int32_t* buff,
                                   const int64_t hash_entry_count,
                                   const int32_t invalid_slot_val,
                                   const size_t block_size_x,
                                   const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      init_hash_join_buff_wrapper, buff, hash_entry_count, invalid_slot_val);
}

#define VALID_POS_FLAG 0

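// The one-to-many fills below operate on a buffer laid out as
// [pos_buff | count_buff | row id payload]. set_valid_pos_flag marks entries that
// received at least one match; after an inclusive prefix sum over count_buff,
// set_valid_pos turns each marked entry's position into the exclusive offset of its
// row id list within the payload section.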
__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (count_buff[i]) {
      pos_buff[i] = VALID_POS_FLAG;
    }
  }
}

__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int64_t entry_count) {
  const int32_t start = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int64_t i = start; i < entry_count; i += step) {
    if (VALID_POS_FLAG == pos_buff[i]) {
      pos_buff[i] = !i ? 0 : count_buff[i - 1];
    }
  }
}

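// Shared driver for the one-to-many fills: zero the per-entry counts, run the
// caller-supplied count kernel, derive offsets via a thrust prefix sum, then zero
// the counts again so the caller-supplied fill kernel can reuse them as write
// cursors while scattering row ids into the payload section.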
template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
void fill_one_to_many_hash_table_on_device_impl(int32_t* buff,
                                                const int64_t hash_entry_count,
                                                const int32_t invalid_slot_val,
                                                const JoinColumn& join_column,
                                                const JoinColumnTypeInfo& type_info,
                                                const size_t block_size_x,
                                                const size_t grid_size_x,
                                                COUNT_MATCHES_FUNCTOR count_matches_func,
                                                FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  count_matches_func();

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);

  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  fill_row_ids_func();
}

void fill_one_to_many_hash_table_on_device(int32_t* buff,
                                           const HashEntryInfo hash_entry_info,
                                           const int32_t invalid_slot_val,
                                           const JoinColumn& join_column,
                                           const JoinColumnTypeInfo& type_info,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  auto count_matches_func = [hash_entry_count,
                             grid_size_x,
                             block_size_x,
                             count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info] {
    cuda_kernel_launch_wrapper(
        SUFFIX(count_matches), count_buff, invalid_slot_val, join_column, type_info);
  };

  auto fill_row_ids_func = [grid_size_x,
                            block_size_x,
                            buff,
                            hash_entry_count,
                            invalid_slot_val,
                            join_column,
                            type_info] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids),
                               buff,
                               hash_entry_count,
                               invalid_slot_val,
                               join_column,
                               type_info);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             block_size_x,
                                             grid_size_x,
                                             count_matches_func,
                                             fill_row_ids_func);
}

void fill_one_to_many_hash_table_on_device_bucketized(int32_t* buff,
                                                      const HashEntryInfo hash_entry_info,
                                                      const int32_t invalid_slot_val,
                                                      const JoinColumn& join_column,
                                                      const JoinColumnTypeInfo& type_info,
                                                      const size_t block_size_x,
                                                      const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
  auto count_matches_func = [grid_size_x,
                             block_size_x,
                             count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             bucket_normalization = hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(count_matches_bucketized),
                               count_buff,
                               invalid_slot_val,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  auto fill_row_ids_func = [grid_size_x,
                            block_size_x,
                            buff,
                            hash_entry_count = hash_entry_info.getNormalizedHashEntryCount(),
                            invalid_slot_val,
                            join_column,
                            type_info,
                            bucket_normalization = hash_entry_info.bucket_normalization] {
    cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_bucketized),
                               buff,
                               hash_entry_count,
                               invalid_slot_val,
                               join_column,
                               type_info,
                               bucket_normalization);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             block_size_x,
                                             grid_size_x,
                                             count_matches_func,
                                             fill_row_ids_func);
}

void fill_one_to_many_hash_table_on_device_sharded(int32_t* buff,
                                                   const HashEntryInfo hash_entry_info,
                                                   const int32_t invalid_slot_val,
                                                   const JoinColumn& join_column,
                                                   const JoinColumnTypeInfo& type_info,
                                                   const ShardInfo& shard_info,
                                                   const size_t block_size_x,
                                                   const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  cuda_kernel_launch_wrapper(SUFFIX(count_matches_sharded),
                             count_buff,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  cuda_kernel_launch_wrapper(SUFFIX(fill_row_ids_sharded),
                             buff,
                             hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             shard_info);
}

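// Baseline (composite key) hash tables store multi-column keys in a separate key
// dictionary. The one-to-many fill follows the same count/scan/fill scheme as above,
// but resolves keys through composite_key_dict, whose component width is T
// (32- or 64-bit).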
template <typename T, typename KEY_HANDLER>
void fill_one_to_many_baseline_hash_table_on_device(int32_t* buff,
                                                    const T* composite_key_dict,
                                                    const int64_t hash_entry_count,
                                                    const int32_t invalid_slot_val,
                                                    const KEY_HANDLER* key_handler,
                                                    const size_t num_elems,
                                                    const size_t block_size_x,
                                                    const size_t grid_size_x) {
  auto pos_buff = buff;
  auto count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  cuda_kernel_launch_wrapper(count_matches_baseline_gpu<T, KEY_HANDLER>,
                             count_buff,
                             composite_key_dict,
                             hash_entry_count,
                             key_handler,
                             num_elems);

  cuda_kernel_launch_wrapper(set_valid_pos_flag, pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  cuda_kernel_launch_wrapper(set_valid_pos, pos_buff, count_buff, hash_entry_count);
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));

  cuda_kernel_launch_wrapper(fill_row_ids_baseline_gpu<T, KEY_HANDLER>,
                             buff,
                             composite_key_dict,
                             hash_entry_count,
                             invalid_slot_val,
                             key_handler,
                             num_elems);
}

template <typename T>
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                      const int64_t entry_count,
                                                      const size_t key_component_count,
                                                      const bool with_val_slot,
                                                      const int32_t invalid_slot_val) {
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}

void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int32_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}

void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int64_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(init_baseline_hash_join_buff_wrapper<int64_t>,
                             hash_join_buff,
                             entry_count,
                             key_component_count,
                             with_val_slot,
                             invalid_slot_val);
}

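// One-to-one fill for baseline (composite key) hash tables. The KEY_HANDLER
// (GenericKeyHandler or OverlapsKeyHandler) produces the key components for each
// input row; errors are again reduced to the first non-zero code via atomicCAS.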
template <typename T, typename KEY_HANDLER>
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
                                                      const int64_t entry_count,
                                                      const int32_t invalid_slot_val,
                                                      const size_t key_component_count,
                                                      const bool with_val_slot,
                                                      int* err,
                                                      const KEY_HANDLER* key_handler,
                                                      const int64_t num_elems) {
  int partial_err = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                            entry_count,
                                                            invalid_slot_val,
                                                            key_component_count,
                                                            with_val_slot,
                                                            key_handler,
                                                            num_elems,
                                                            -1,
                                                            -1);
  atomicCAS(err, 0, partial_err);
}

void fill_baseline_hash_join_buff_on_device_32(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<int32_t, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                               const int64_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const int64_t num_elems,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, GenericKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void overlaps_fill_baseline_hash_join_buff_on_device_64(
    int8_t* hash_buff,
    const int64_t entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const bool with_val_slot,
    int* dev_err_buff,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(
      fill_baseline_hash_join_buff_wrapper<unsigned long long, OverlapsKeyHandler>,
      hash_buff,
      entry_count,
      invalid_slot_val,
      key_component_count,
      with_val_slot,
      dev_err_buff,
      key_handler,
      num_elems);
}

void fill_one_to_many_baseline_hash_table_on_device_32(
    int32_t* buff,
    const int32_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}

void fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const GenericKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}

void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const int64_t hash_entry_count,
    const int32_t invalid_slot_val,
    const OverlapsKeyHandler* key_handler,
    const int64_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}

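// Estimates the number of distinct join keys with HyperLogLog: the kernel updates
// one byte-wide HLL register per bucket (2^b registers in hll_buffer). The overlaps
// variant also fills row_counts_buffer with per-row counts and converts them into a
// running total with a prefix sum for later use when building the overlaps table.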
void approximate_distinct_tuples_on_device_overlaps(uint8_t* hll_buffer,
                                                    const uint32_t b,
                                                    int32_t* row_counts_buffer,
                                                    const OverlapsKeyHandler* key_handler,
                                                    const int64_t num_elems,
                                                    const size_t block_size_x,
                                                    const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<OverlapsKeyHandler>,
                             hll_buffer,
                             row_counts_buffer,
                             b,
                             num_elems,
                             key_handler);

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}

void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
                                           const uint32_t b,
                                           const GenericKeyHandler* key_handler,
                                           const int64_t num_elems,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(approximate_distinct_tuples_impl_gpu<GenericKeyHandler>,
                             hll_buffer,
                             nullptr,
                             b,
                             num_elems,
                             key_handler);
}

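// Computes per-dimension bucket sizes for the overlaps hash join (two dimensions,
// hence compute_bucket_sizes_impl_gpu<2>) given the bucket size threshold.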
void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
                                    const JoinColumn* join_column,
                                    const JoinColumnTypeInfo* type_info,
                                    const double bucket_sz_threshold,
                                    const size_t block_size_x,
                                    const size_t grid_size_x) {
  cuda_kernel_launch_wrapper(compute_bucket_sizes_impl_gpu<2>,
                             bucket_sizes_buffer,
                             join_column,
                             type_info,
                             bucket_sz_threshold,
                             block_size_x,
                             grid_size_x);
}