OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HashJoinRuntimeGpu.cu
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "HashJoinRuntime.cpp"
17 
18 #include <thrust/device_ptr.h>
19 #include <thrust/scan.h>
20 
// Kernel wrapper around the device routine that populates the one-to-one
// hash join buffer. The trailing NULL/-1 arguments are the CPU-only string
// dictionary proxies and thread id/count, unused on the GPU path.
__global__ void fill_hash_join_buff_wrapper(int32_t* buff,
                                            const int32_t invalid_slot_val,
                                            const JoinColumn join_column,
                                            const JoinColumnTypeInfo type_info,
                                            int* err) {
  const int thread_status = SUFFIX(fill_hash_join_buff)(
      buff, invalid_slot_val, join_column, type_info, NULL, NULL, -1, -1);
  // Latch the first non-zero status: atomicCAS only stores when *err is
  // still 0, so a later success (0) never overwrites a recorded error.
  atomicCAS(err, 0, thread_status);
}
30 
// Bucketized variant of fill_hash_join_buff_wrapper: keys are normalized by
// bucket_normalization before slotting. NULL/-1 args are CPU-only proxies.
__global__ void fill_hash_join_buff_bucketized_wrapper(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    int* err,
    const int64_t bucket_normalization) {
  const int thread_status = SUFFIX(fill_hash_join_buff_bucketized)(
      buff,
      invalid_slot_val,
      join_column,
      type_info,
      NULL,
      NULL,
      -1,
      -1,
      bucket_normalization);
  // Record only the first error (CAS leaves *err untouched once non-zero).
  atomicCAS(err, 0, thread_status);
}
49 
// Host-side launcher for the bucketized one-to-one fill kernel.
// Errors are reported through dev_err_buff (device memory); the launch is
// asynchronous and no error check is done here.
void fill_hash_join_buff_on_device_bucketized(int32_t* buff,
                                              const int32_t invalid_slot_val,
                                              int* dev_err_buff,
                                              const JoinColumn join_column,
                                              const JoinColumnTypeInfo type_info,
                                              const size_t block_size_x,
                                              const size_t grid_size_x,
                                              const int64_t bucket_normalization) {
  fill_hash_join_buff_bucketized_wrapper<<<grid_size_x, block_size_x>>>(
      buff, invalid_slot_val, join_column, type_info, dev_err_buff, bucket_normalization);
}
61 
// Host-side launcher for the plain one-to-one fill kernel; per-thread
// statuses are folded into dev_err_buff by the wrapper kernel.
void fill_hash_join_buff_on_device(int32_t* buff,
                                   const int32_t invalid_slot_val,
                                   int* dev_err_buff,
                                   const JoinColumn join_column,
                                   const JoinColumnTypeInfo type_info,
                                   const size_t block_size_x,
                                   const size_t grid_size_x) {
  fill_hash_join_buff_wrapper<<<grid_size_x, block_size_x>>>(
      buff, invalid_slot_val, join_column, type_info, dev_err_buff);
}
72 
// Sharded + bucketized kernel wrapper: fills only the slots belonging to
// this device's shard (per shard_info), normalizing keys by
// bucket_normalization. NULL/-1 args are CPU-only proxies/thread ids.
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    int* err,
    const int64_t bucket_normalization) {
  const int thread_status =
      SUFFIX(fill_hash_join_buff_sharded_bucketized)(buff,
                                                     invalid_slot_val,
                                                     join_column,
                                                     type_info,
                                                     shard_info,
                                                     NULL,
                                                     NULL,
                                                     -1,
                                                     -1,
                                                     bucket_normalization);
  // First non-zero status wins; subsequent CAS attempts are no-ops.
  atomicCAS(err, 0, thread_status);
}
93 
// Sharded kernel wrapper: fills only this shard's portion of the
// one-to-one hash join buffer. NULL/-1 args are CPU-only proxies.
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t* buff,
                                                    const int32_t invalid_slot_val,
                                                    const JoinColumn join_column,
                                                    const JoinColumnTypeInfo type_info,
                                                    const ShardInfo shard_info,
                                                    int* err) {
  const int thread_status = SUFFIX(fill_hash_join_buff_sharded)(
      buff, invalid_slot_val, join_column, type_info, shard_info, NULL, NULL, -1, -1);
  // Latch the first error only.
  atomicCAS(err, 0, thread_status);
}
104 
// Host-side launcher for the sharded + bucketized fill kernel; errors land
// in dev_err_buff (device memory).
void fill_hash_join_buff_on_device_sharded_bucketized(
    int32_t* buff,
    const int32_t invalid_slot_val,
    int* dev_err_buff,
    const JoinColumn join_column,
    const JoinColumnTypeInfo type_info,
    const ShardInfo shard_info,
    const size_t block_size_x,
    const size_t grid_size_x,
    const int64_t bucket_normalization) {
  fill_hash_join_buff_wrapper_sharded_bucketized<<<grid_size_x, block_size_x>>>(
      buff,
      invalid_slot_val,
      join_column,
      type_info,
      shard_info,
      dev_err_buff,
      bucket_normalization);
}
124 
// Host-side launcher for the sharded one-to-one fill kernel; errors land
// in dev_err_buff (device memory).
void fill_hash_join_buff_on_device_sharded(int32_t* buff,
                                           const int32_t invalid_slot_val,
                                           int* dev_err_buff,
                                           const JoinColumn join_column,
                                           const JoinColumnTypeInfo type_info,
                                           const ShardInfo shard_info,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  fill_hash_join_buff_wrapper_sharded<<<grid_size_x, block_size_x>>>(
      buff, invalid_slot_val, join_column, type_info, shard_info, dev_err_buff);
}
136 
// Kernel wrapper that initializes every slot of the hash join buffer to
// invalid_slot_val; the trailing -1,-1 are CPU-only thread id/count.
__global__ void init_hash_join_buff_wrapper(int32_t* buff,
                                            const int32_t hash_entry_count,
                                            const int32_t invalid_slot_val) {
  SUFFIX(init_hash_join_buff)(buff, hash_entry_count, invalid_slot_val, -1, -1);
}
142 
// Host-side launcher for hash join buffer initialization.
void init_hash_join_buff_on_device(int32_t* buff,
                                   const int32_t hash_entry_count,
                                   const int32_t invalid_slot_val,
                                   const size_t block_size_x,
                                   const size_t grid_size_x) {
  init_hash_join_buff_wrapper<<<grid_size_x, block_size_x>>>(
      buff, hash_entry_count, invalid_slot_val);
}
151 
152 #define VALID_POS_FLAG 0
153 
// Grid-stride kernel: mark pos_buff[i] with VALID_POS_FLAG wherever the
// corresponding match count is non-zero, leaving empty entries untouched.
__global__ void set_valid_pos_flag(int32_t* pos_buff,
                                   const int32_t* count_buff,
                                   const int32_t entry_count) {
  const int32_t first = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t stride = blockDim.x * gridDim.x;
  for (int32_t idx = first; idx < entry_count; idx += stride) {
    if (count_buff[idx]) {
      pos_buff[idx] = VALID_POS_FLAG;
    }
  }
}
165 
// Grid-stride kernel: for each entry previously flagged valid, store its
// starting offset into the row-id section. count_buff holds an inclusive
// prefix sum at this point, so the offset for entry i is count_buff[i-1]
// (0 for the first entry).
__global__ void set_valid_pos(int32_t* pos_buff,
                              int32_t* count_buff,
                              const int32_t entry_count) {
  const int32_t first = threadIdx.x + blockDim.x * blockIdx.x;
  const int32_t stride = blockDim.x * gridDim.x;
  for (int32_t idx = first; idx < entry_count; idx += stride) {
    if (VALID_POS_FLAG == pos_buff[idx]) {
      pos_buff[idx] = (idx == 0) ? 0 : count_buff[idx - 1];
    }
  }
}
177 
// Shared driver for building one-to-many hash tables on the device:
//   1. zero the count section and count matches per hash entry,
//   2. flag occupied positions,
//   3. inclusive-scan the counts to get row-id offsets,
//   4. convert flags to exclusive offsets (set_valid_pos),
//   5. re-zero the counts and fill row ids.
// The buffer layout is [pos | count | row ids], each section
// hash_entry_count entries wide. invalid_slot_val is unused here; the
// functors capture what they need.
template <typename COUNT_MATCHES_FUNCTOR, typename FILL_ROW_IDS_FUNCTOR>
void fill_one_to_many_hash_table_on_device_impl(
    int32_t* buff,
    const int32_t hash_entry_count,
    const int32_t invalid_slot_val,
    const JoinColumn& join_column,
    const JoinColumnTypeInfo& type_info,
    const size_t block_size_x,
    const size_t grid_size_x,
    COUNT_MATCHES_FUNCTOR count_matches_func,
    FILL_ROW_IDS_FUNCTOR fill_row_ids_func) {
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  count_matches_func();

  set_valid_pos_flag<<<grid_size_x, block_size_x>>>(
      pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  set_valid_pos<<<grid_size_x, block_size_x>>>(pos_buff, count_buff, hash_entry_count);
  // Counts are reused as per-entry write cursors while filling row ids.
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  fill_row_ids_func();
}
203 
// Builds a one-to-many perfect hash table by delegating to the shared impl
// with functors that launch the count_matches and fill_row_ids kernels.
void fill_one_to_many_hash_table_on_device(int32_t* buff,
                                           const HashEntryInfo hash_entry_info,
                                           const int32_t invalid_slot_val,
                                           const JoinColumn& join_column,
                                           const JoinColumnTypeInfo& type_info,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  // Counts matches into the second section of the buffer.
  auto count_matches_func = [hash_entry_count,
                             grid_size_x,
                             block_size_x,
                             count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info] {
    SUFFIX(count_matches)<<<grid_size_x, block_size_x>>>(
        count_buff, invalid_slot_val, join_column, type_info);
  };

  // Fills the row-id section once offsets have been computed.
  auto fill_row_ids_func = [grid_size_x,
                            block_size_x,
                            buff,
                            hash_entry_count,
                            invalid_slot_val,
                            join_column,
                            type_info] {
    SUFFIX(fill_row_ids)<<<grid_size_x, block_size_x>>>(
        buff, hash_entry_count, invalid_slot_val, join_column, type_info);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             block_size_x,
                                             grid_size_x,
                                             count_matches_func,
                                             fill_row_ids_func);
}
244 
// Bucketized variant: uses the normalized entry count and passes
// bucket_normalization through to the bucketized count/fill kernels.
void fill_one_to_many_hash_table_on_device_bucketized(
    int32_t* buff,
    const HashEntryInfo hash_entry_info,
    const int32_t invalid_slot_val,
    const JoinColumn& join_column,
    const JoinColumnTypeInfo& type_info,
    const size_t block_size_x,
    const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();
  // Counts matches into the second section of the buffer.
  auto count_matches_func = [grid_size_x,
                             block_size_x,
                             count_buff = buff + hash_entry_count,
                             invalid_slot_val,
                             join_column,
                             type_info,
                             bucket_normalization =
                                 hash_entry_info.bucket_normalization] {
    SUFFIX(count_matches_bucketized)<<<grid_size_x, block_size_x>>>(
        count_buff, invalid_slot_val, join_column, type_info, bucket_normalization);
  };

  // Fills the row-id section once offsets have been computed.
  auto fill_row_ids_func = [grid_size_x,
                            block_size_x,
                            buff,
                            hash_entry_count =
                                hash_entry_info.getNormalizedHashEntryCount(),
                            invalid_slot_val,
                            join_column,
                            type_info,
                            bucket_normalization = hash_entry_info.bucket_normalization] {
    SUFFIX(fill_row_ids_bucketized)<<<grid_size_x, block_size_x>>>(buff,
                                                                   hash_entry_count,
                                                                   invalid_slot_val,
                                                                   join_column,
                                                                   type_info,
                                                                   bucket_normalization);
  };

  fill_one_to_many_hash_table_on_device_impl(buff,
                                             hash_entry_count,
                                             invalid_slot_val,
                                             join_column,
                                             type_info,
                                             block_size_x,
                                             grid_size_x,
                                             count_matches_func,
                                             fill_row_ids_func);
}
292 
// Sharded variant of the one-to-many build. Inlines the count / flag /
// scan / offset / fill pipeline (see fill_one_to_many_hash_table_on_device_impl)
// using the sharded count and fill kernels.
void fill_one_to_many_hash_table_on_device_sharded(
    int32_t* buff,
    const HashEntryInfo hash_entry_info,
    const int32_t invalid_slot_val,
    const JoinColumn& join_column,
    const JoinColumnTypeInfo& type_info,
    const ShardInfo& shard_info,
    const size_t block_size_x,
    const size_t grid_size_x) {
  auto hash_entry_count = hash_entry_info.hash_entry_count;
  int32_t* pos_buff = buff;
  int32_t* count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  SUFFIX(count_matches_sharded)<<<grid_size_x, block_size_x>>>(
      count_buff, invalid_slot_val, join_column, type_info, shard_info);

  set_valid_pos_flag<<<grid_size_x, block_size_x>>>(
      pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  set_valid_pos<<<grid_size_x, block_size_x>>>(pos_buff, count_buff, hash_entry_count);
  // Counts are reused as write cursors while filling row ids.
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  SUFFIX(fill_row_ids_sharded)<<<grid_size_x, block_size_x>>>(
      buff, hash_entry_count, invalid_slot_val, join_column, type_info, shard_info);
}
319 
// Baseline (composite-key) one-to-many build: same count / flag / scan /
// offset / fill pipeline, driven by a key handler and the composite key
// dictionary. T is the key component width (int32_t or int64_t).
template <typename T, typename KEY_HANDLER>
void fill_one_to_many_baseline_hash_table_on_device(
    int32_t* buff,
    const T* composite_key_dict,
    const size_t hash_entry_count,
    const int32_t invalid_slot_val,
    const KEY_HANDLER* key_handler,
    const size_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  auto pos_buff = buff;
  auto count_buff = buff + hash_entry_count;
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  count_matches_baseline_gpu<<<grid_size_x, block_size_x>>>(
      count_buff, composite_key_dict, hash_entry_count, key_handler, num_elems);

  set_valid_pos_flag<<<grid_size_x, block_size_x>>>(
      pos_buff, count_buff, hash_entry_count);

  auto count_buff_dev_ptr = thrust::device_pointer_cast(count_buff);
  thrust::inclusive_scan(
      count_buff_dev_ptr, count_buff_dev_ptr + hash_entry_count, count_buff_dev_ptr);
  set_valid_pos<<<grid_size_x, block_size_x>>>(pos_buff, count_buff, hash_entry_count);
  // Counts are reused as write cursors while filling row ids.
  cudaMemset(count_buff, 0, hash_entry_count * sizeof(int32_t));
  fill_row_ids_baseline_gpu<<<grid_size_x, block_size_x>>>(buff,
                                                           composite_key_dict,
                                                           hash_entry_count,
                                                           invalid_slot_val,
                                                           key_handler,
                                                           num_elems);
}
350 
// Kernel wrapper initializing a baseline (composite-key) hash join buffer
// with T-wide key components; -1,-1 are CPU-only thread id/count.
template <typename T>
__global__ void init_baseline_hash_join_buff_wrapper(int8_t* hash_join_buff,
                                                     const size_t entry_count,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     const int32_t invalid_slot_val) {
  SUFFIX(init_baseline_hash_join_buff)<T>(hash_join_buff,
                                          entry_count,
                                          key_component_count,
                                          with_val_slot,
                                          invalid_slot_val,
                                          -1,
                                          -1);
}
365 
// Host-side launcher: initialize a baseline hash table with 32-bit key
// components.
void init_baseline_hash_join_buff_on_device_32(int8_t* hash_join_buff,
                                               const int32_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  init_baseline_hash_join_buff_wrapper<int32_t><<<grid_size_x, block_size_x>>>(
      hash_join_buff, entry_count, key_component_count, with_val_slot, invalid_slot_val);
}
376 
// Host-side launcher: initialize a baseline hash table with 64-bit key
// components.
void init_baseline_hash_join_buff_on_device_64(int8_t* hash_join_buff,
                                               const int32_t entry_count,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               const int32_t invalid_slot_val,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  init_baseline_hash_join_buff_wrapper<int64_t><<<grid_size_x, block_size_x>>>(
      hash_join_buff, entry_count, key_component_count, with_val_slot, invalid_slot_val);
}
387 
// Kernel wrapper filling a baseline hash table via a key handler; -1,-1
// are CPU-only thread id/count, and the first non-zero status is latched
// into *err.
template <typename T, typename KEY_HANDLER>
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t* hash_buff,
                                                     const size_t entry_count,
                                                     const int32_t invalid_slot_val,
                                                     const size_t key_component_count,
                                                     const bool with_val_slot,
                                                     int* err,
                                                     const KEY_HANDLER* key_handler,
                                                     const size_t num_elems) {
  const int thread_status = SUFFIX(fill_baseline_hash_join_buff)<T>(hash_buff,
                                                                    entry_count,
                                                                    invalid_slot_val,
                                                                    key_component_count,
                                                                    with_val_slot,
                                                                    key_handler,
                                                                    num_elems,
                                                                    -1,
                                                                    -1);
  // atomicCAS only stores while *err is still 0, so the first error wins.
  atomicCAS(err, 0, thread_status);
}
408 
// Host-side launcher: fill a baseline hash table with 32-bit key
// components using a generic key handler; errors land in dev_err_buff.
void fill_baseline_hash_join_buff_on_device_32(int8_t* hash_buff,
                                               const size_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const size_t num_elems,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  fill_baseline_hash_join_buff_wrapper<int32_t>
      <<<grid_size_x, block_size_x>>>(hash_buff,
                                      entry_count,
                                      invalid_slot_val,
                                      key_component_count,
                                      with_val_slot,
                                      dev_err_buff,
                                      key_handler,
                                      num_elems);
}
429 
// Host-side launcher: fill a baseline hash table with 64-bit key
// components using a generic key handler; errors land in dev_err_buff.
// NOTE(review): instantiated with unsigned long long (matches the 64-bit
// atomics used by the fill routine) rather than int64_t.
void fill_baseline_hash_join_buff_on_device_64(int8_t* hash_buff,
                                               const size_t entry_count,
                                               const int32_t invalid_slot_val,
                                               const size_t key_component_count,
                                               const bool with_val_slot,
                                               int* dev_err_buff,
                                               const GenericKeyHandler* key_handler,
                                               const size_t num_elems,
                                               const size_t block_size_x,
                                               const size_t grid_size_x) {
  fill_baseline_hash_join_buff_wrapper<unsigned long long>
      <<<grid_size_x, block_size_x>>>(hash_buff,
                                      entry_count,
                                      invalid_slot_val,
                                      key_component_count,
                                      with_val_slot,
                                      dev_err_buff,
                                      key_handler,
                                      num_elems);
}
450 
// Host-side launcher: fill a baseline hash table with 64-bit key
// components using the overlaps (geospatial) key handler; errors land in
// dev_err_buff.
void overlaps_fill_baseline_hash_join_buff_on_device_64(
    int8_t* hash_buff,
    const size_t entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const bool with_val_slot,
    int* dev_err_buff,
    const OverlapsKeyHandler* key_handler,
    const size_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_baseline_hash_join_buff_wrapper<unsigned long long>
      <<<grid_size_x, block_size_x>>>(hash_buff,
                                      entry_count,
                                      invalid_slot_val,
                                      key_component_count,
                                      with_val_slot,
                                      dev_err_buff,
                                      key_handler,
                                      num_elems);
}
472 
// Host-side entry point: one-to-many baseline table over a 32-bit
// composite key dictionary. key_component_count is accepted for interface
// parity but not used by the underlying template.
void fill_one_to_many_baseline_hash_table_on_device_32(
    int32_t* buff,
    const int32_t* composite_key_dict,
    const size_t hash_entry_count,
    const int32_t invalid_slot_val,
    const size_t key_component_count,
    const GenericKeyHandler* key_handler,
    const size_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int32_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}
492 
// Host-side entry point: one-to-many baseline table over a 64-bit
// composite key dictionary with a generic key handler.
void fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const size_t hash_entry_count,
    const int32_t invalid_slot_val,
    const GenericKeyHandler* key_handler,
    const size_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}
511 
// Host-side entry point: one-to-many baseline table over a 64-bit
// composite key dictionary with the overlaps (geospatial) key handler.
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(
    int32_t* buff,
    const int64_t* composite_key_dict,
    const size_t hash_entry_count,
    const int32_t invalid_slot_val,
    const OverlapsKeyHandler* key_handler,
    const size_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  fill_one_to_many_baseline_hash_table_on_device<int64_t>(buff,
                                                          composite_key_dict,
                                                          hash_entry_count,
                                                          invalid_slot_val,
                                                          key_handler,
                                                          num_elems,
                                                          block_size_x,
                                                          grid_size_x);
}
530 
// HLL-based distinct estimation for the overlaps join path: fills the HLL
// register buffer (2^b registers) and per-row bucket counts, then
// inclusive-scans the counts so each entry holds a running total.
void approximate_distinct_tuples_on_device_overlaps(
    uint8_t* hll_buffer,
    const uint32_t b,
    int32_t* row_counts_buffer,
    const OverlapsKeyHandler* key_handler,
    const size_t num_elems,
    const size_t block_size_x,
    const size_t grid_size_x) {
  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x>>>(
      hll_buffer, row_counts_buffer, b, num_elems, key_handler);

  auto row_counts_buffer_ptr = thrust::device_pointer_cast(row_counts_buffer);
  thrust::inclusive_scan(
      row_counts_buffer_ptr, row_counts_buffer_ptr + num_elems, row_counts_buffer_ptr);
}
545 
// HLL-based distinct estimation for the generic join path; no per-row
// counts are needed, so nullptr is passed for the counts buffer.
void approximate_distinct_tuples_on_device(uint8_t* hll_buffer,
                                           const uint32_t b,
                                           const GenericKeyHandler* key_handler,
                                           const size_t num_elems,
                                           const size_t block_size_x,
                                           const size_t grid_size_x) {
  approximate_distinct_tuples_impl_gpu<<<grid_size_x, block_size_x>>>(
      hll_buffer, nullptr, b, num_elems, key_handler);
}
555 
// Computes overlaps-join bucket sizes on the device. The kernel is
// instantiated for 2 dimensions and receives the launch geometry as
// explicit arguments (it needs them for its own striding).
void compute_bucket_sizes_on_device(double* bucket_sizes_buffer,
                                    const JoinColumn* join_column,
                                    const double bucket_sz_threshold,
                                    const size_t block_size_x,
                                    const size_t grid_size_x) {
  compute_bucket_sizes_impl_gpu<2><<<grid_size_x, block_size_x>>>(
      bucket_sizes_buffer, join_column, bucket_sz_threshold, block_size_x, grid_size_x);
}
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
void fill_one_to_many_hash_table_on_device(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const size_t block_size_x, const size_t grid_size_x)
void overlaps_fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const OverlapsKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
void fill_one_to_many_baseline_hash_table_on_device_32(int32_t *buff, const int32_t *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const GenericKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
void fill_baseline_hash_join_buff_on_device_32(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int32_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_sharded(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const size_t block_size_x, const size_t grid_size_x)
void fill_hash_join_buff_on_device(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const size_t block_size_x, const size_t grid_size_x)
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int32_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int32_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
__global__ void set_valid_pos(int32_t *pos_buff, int32_t *count_buff, const int32_t entry_count)
#define SUFFIX(name)
__global__ void fill_hash_join_buff_wrapper_sharded(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err)
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
__global__ void fill_hash_join_buff_bucketized_wrapper(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err, const int64_t bucket_normalization)
__global__ void fill_hash_join_buff_wrapper(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, int *err)
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
DEVICE int SUFFIX() fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const FILL_HANDLER *f, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_impl(int32_t *buff, const int32_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const size_t block_size_x, const size_t grid_size_x, COUNT_MATCHES_FUNCTOR count_matches_func, FILL_ROW_IDS_FUNCTOR fill_row_ids_func)
__global__ void fill_baseline_hash_join_buff_wrapper(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *err, const KEY_HANDLER *key_handler, const size_t num_elems)
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
#define VALID_POS_FLAG
void fill_hash_join_buff_on_device_sharded(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const size_t block_size_x, const size_t grid_size_x)
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int32_t hash_entry_count, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_hash_table_on_device_bucketized(int32_t *buff, const HashEntryInfo hash_entry_info, const int32_t invalid_slot_val, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const size_t block_size_x, const size_t grid_size_x)
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
void init_baseline_hash_join_buff_on_device_32(int8_t *hash_join_buff, const int32_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t block_size_x, const size_t grid_size_x)
int64_t bucket_normalization
size_t hash_entry_count
void fill_one_to_many_baseline_hash_table_on_device(int32_t *buff, const T *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const KEY_HANDLER *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
void init_baseline_hash_join_buff_on_device_64(int8_t *hash_join_buff, const int32_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t block_size_x, const size_t grid_size_x)
__global__ void set_valid_pos_flag(int32_t *pos_buff, const int32_t *count_buff, const int32_t entry_count)
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const GenericKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
void init_hash_join_buff_on_device(int32_t *buff, const int32_t entry_count, const int32_t invalid_slot_val, const size_t block_size_x, const size_t grid_size_x)
DEVICE int SUFFIX() fill_hash_join_buff(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_baseline_hash_join_buff_on_device_64(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, int *dev_err_buff, const GenericKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
size_t getNormalizedHashEntryCount() const
void approximate_distinct_tuples_on_device_overlaps(uint8_t *hll_buffer, const uint32_t b, int32_t *row_counts_buffer, const OverlapsKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
__global__ void fill_hash_join_buff_wrapper_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, int *err, const int64_t bucket_normalization)
__global__ void init_hash_join_buff_wrapper(int32_t *buff, const int32_t hash_entry_count, const int32_t invalid_slot_val)
void overlaps_fill_one_to_many_baseline_hash_table_on_device_64(int32_t *buff, const int64_t *composite_key_dict, const size_t hash_entry_count, const int32_t invalid_slot_val, const OverlapsKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
__global__ void init_baseline_hash_join_buff_wrapper(int8_t *hash_join_buff, const size_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val)
void compute_bucket_sizes_on_device(double *bucket_sizes_buffer, const JoinColumn *join_column_for_key, const double bucket_sz_threshold, const size_t block_size_x, const size_t grid_size_x)
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const void *sd_inner_proxy, const void *sd_outer_proxy, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
void fill_hash_join_buff_on_device_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const size_t block_size_x, const size_t grid_size_x, const int64_t bucket_normalization)
void fill_hash_join_buff_on_device_bucketized(int32_t *buff, const int32_t invalid_slot_val, int *dev_err_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const size_t block_size_x, const size_t grid_size_x, const int64_t bucket_normalization)