#if CUDA_VERSION < 10000
static_assert(false, "CUDA v10.0 or later is required.");
#endif

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 350)
static_assert(false, "CUDA Compute Capability of 3.5 or greater is required.");
#endif
extern "C" __device__ int32_t pos_start_impl(const int32_t* row_index_resume) {
  return blockIdx.x * blockDim.x + threadIdx.x;
}

extern "C" __device__ int32_t pos_step_impl() {
  return blockDim.x * gridDim.x;
}

extern "C" __device__ int8_t thread_warp_idx(const int8_t warp_sz) {
  return threadIdx.x % warp_sz;
}
extern "C" __device__ const int64_t* init_shared_mem_nop(const int64_t* groups_buffer,
                                                         const int32_t groups_buffer_size) {
  return groups_buffer;
}

extern "C" __device__ int64_t* declare_dynamic_shared_memory() {
  extern __shared__ int64_t shared_mem_buffer[];
  return shared_mem_buffer;
}
extern "C" __device__ const int64_t* init_shared_mem(const int64_t* global_groups_buffer,
                                                     const int32_t groups_buffer_size) {
  // Dynamic shared memory; its size is supplied at kernel launch time.
  extern __shared__ int64_t shared_groups_buffer[];
  // groups_buffer_size is in bytes; copy in 8-byte (int64_t) units.
  const int32_t buffer_units = groups_buffer_size >> 3;
  for (int32_t pos = threadIdx.x; pos < buffer_units; pos += blockDim.x) {
    shared_groups_buffer[pos] = global_groups_buffer[pos];
  }
  __syncthreads();
  return shared_groups_buffer;
}
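
// Both declare_dynamic_shared_memory and init_shared_mem map onto the kernel's dynamic
// shared memory, so the byte count must be passed as the third launch parameter. A
// minimal sketch of such a launch follows; the kernel name, launch dimensions, and the
// assumption that it is compiled together with this runtime are all hypothetical.
__global__ void query_kernel(const int64_t* global_groups_buffer,
                             const int32_t groups_buffer_size) {
  // Copies the global group-by buffer into dynamic shared memory for this block.
  const int64_t* groups = init_shared_mem(global_groups_buffer, groups_buffer_size);
  // ... per-thread aggregation against `groups` ...
  (void)groups;
}

void launch_query_kernel(const int64_t* dev_buffer, int32_t buffer_bytes) {
  const dim3 grid(64);
  const dim3 block(128);
  // Third launch parameter: dynamic shared memory size in bytes.
  query_kernel<<<grid, block, buffer_bytes>>>(dev_buffer, buffer_bytes);
}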
#define init_group_by_buffer_gpu_impl init_group_by_buffer_gpu
#undef init_group_by_buffer_gpu_impl
__inline__ __device__ uint32_t get_smid(void) {
  uint32_t ret;
  asm("mov.u32 %0, %%smid;" : "=r"(ret));
  return ret;
}
extern "C" __device__ bool dynamic_watchdog() {
  // (the cycle-budget and host-abort checks at the top of the function are elided in
  //  this listing)
  const uint32_t smid = get_smid();
  __shared__ volatile int64_t dw_block_cycle_start;  // cycle start shared by the block
  __shared__ volatile bool dw_should_terminate;      // block-wide termination flag
  if (threadIdx.x == 0) {
    dw_block_cycle_start = 0LL;
    int64_t cycle_count = static_cast<int64_t>(clock64());
    // Record this SM's start cycle on first use, then read it back.
    dw_block_cycle_start = static_cast<int64_t>(
        atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
                  0ULL,
                  static_cast<unsigned long long>(cycle_count)));
    int64_t cycles = cycle_count - dw_block_cycle_start;
    if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
        (cycles > dw_cycle_budget)) {
      // This block's SM has exceeded the cycle budget set by the host.
      dw_should_terminate = true;
    } else {
      dw_should_terminate = false;
    }
  }
  __syncthreads();
  return dw_should_terminate;
}
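
// dynamic_watchdog() is meant to be polled from a long-running kernel so that an entire
// thread block gives up together once the host-set cycle budget is exhausted. The sketch
// below is hypothetical (kernel, error-reporting convention, and per-row work are
// placeholders); the trip count is made block-uniform because dynamic_watchdog()
// synchronizes the block internally.
__global__ void scan_kernel(const int64_t* rows, int64_t row_count, int32_t* error_code) {
  const int64_t start = pos_start_impl(nullptr);
  const int64_t step = pos_step_impl();
  // Every thread in the block takes the same number of iterations.
  const int64_t trips = (row_count + step - 1) / step;
  for (int64_t t = 0; t < trips; ++t) {
    if (dynamic_watchdog()) {
      if (threadIdx.x == 0) {
        atomicExch(error_code, 1);  // report the timeout to the host
      }
      return;  // the whole block sees the same flag and returns together
    }
    const int64_t i = start + t * step;
    if (i < row_count) {
      // process_row(rows, i);  // hypothetical per-row work
    }
  }
}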
template <typename T = unsigned long long>
inline __device__ T get_empty_key() {
  return EMPTY_KEY_64;
}
// (the specialization returning the 32-bit empty key is elided in this listing)
template <typename T>
__device__ int64_t* get_matching_group_value(int64_t* groups_buffer,
                                             const uint32_t h,
                                             const T* key,
                                             const uint32_t key_count,
                                             const uint32_t row_size_quad) {
  const T empty_key = get_empty_key<T>();
  uint32_t off = h * row_size_quad;
  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
  // Try to claim the slot by CAS-ing the first key component against the empty key.
  const T old = atomicCAS(row_ptr, empty_key, *key);
  if (empty_key == old && key_count > 1) {
    // This thread won the slot; publish the remaining key components.
    for (size_t i = 1; i <= key_count - 1; ++i) {
      atomicExch(row_ptr + i, key[i]);
    }
  }
  // Spin until the winning thread has finished writing the whole key.
  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
  }
  // A slot holding a different key is not a match.
  for (uint32_t i = 0; i < key_count; ++i) {
    if (row_ptr[i] != key[i]) {
      return NULL;
    }
  }
  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
}

// Width dispatcher: 4-byte keys go through the unsigned int instantiation, 8-byte keys
// through unsigned long long.
extern "C" __device__ int64_t* get_matching_group_value(int64_t* groups_buffer,
                                                        const uint32_t h,
                                                        const int64_t* key,
                                                        const uint32_t key_count,
                                                        const uint32_t key_width,
                                                        const uint32_t row_size_quad) {
  switch (key_width) {
    case 4:
      return get_matching_group_value(
          groups_buffer, h, reinterpret_cast<const unsigned int*>(key), key_count, row_size_quad);
    case 8:
      return get_matching_group_value(
          groups_buffer, h, reinterpret_cast<const unsigned long long*>(key), key_count, row_size_quad);
    default:
      return NULL;
  }
}
template <typename T>
__device__ int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
                                                          const uint32_t entry_count,
                                                          const uint32_t h,
                                                          const T* key,
                                                          const uint32_t key_count) {
  const T empty_key = get_empty_key<T>();
  const T old = atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
  // The winning thread writes the remaining key components, one column at a time.
  if (old == empty_key) {
    uint32_t offset = h + entry_count;
    for (size_t i = 1; i < key_count; ++i) {
      *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
      offset += entry_count;
    }
  }
  __threadfence();
  // Losing threads verify that the slot holds the key they are looking for.
  if (old != empty_key) {
    uint32_t offset = h;
    for (uint32_t i = 0; i < key_count; ++i) {
      if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
        return -1;
      }
      offset += entry_count;
    }
  }
  return h;
}

// Width dispatcher for the columnar variant.
extern "C" __device__ int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
                                                                     const uint32_t entry_count,
                                                                     const uint32_t h,
                                                                     const int64_t* key,
                                                                     const uint32_t key_count,
                                                                     const uint32_t key_width) {
  switch (key_width) {
    case 4:
      return get_matching_group_value_columnar_slot(
          groups_buffer, entry_count, h, reinterpret_cast<const unsigned int*>(key), key_count);
    case 8:
      return get_matching_group_value_columnar_slot(
          groups_buffer, entry_count, h, reinterpret_cast<const unsigned long long*>(key), key_count);
    default:
      return -1;
  }
}
extern "C" __device__ int64_t* get_matching_group_value_columnar(
    int64_t* groups_buffer,
    const uint32_t h,
    const int64_t* key,
    const uint32_t key_qw_count,
    const size_t entry_count) {
  uint32_t off = h;
  const uint64_t old = atomicCAS(
      reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
  if (EMPTY_KEY_64 == old) {
    // This thread claimed the entry: write the key one column (entry_count apart) at a
    // time and return the slot just past the key columns.
    for (size_t i = 0; i < key_qw_count; ++i) {
      groups_buffer[off] = key[i];
      off += entry_count;
    }
    return &groups_buffer[off];
  }
  // Otherwise verify that the existing entry matches the requested key.
  off = h;
  for (size_t i = 0; i < key_qw_count; ++i) {
    if (groups_buffer[off] != key[i]) {
      return NULL;
    }
    off += entry_count;
  }
  return &groups_buffer[off];
}
__device__ int64_t atomicMax64(int64_t* address, int64_t val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}

__device__ int64_t atomicMin64(int64_t* address, int64_t val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600)
// Pre-Pascal GPUs lack a native double-precision atomicAdd; emulate it with a CAS loop.
__device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // The integer comparison below avoids hanging on NaN (NaN != NaN).
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
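
// The same compare-and-swap loop is the stock recipe for any floating-point atomic the
// hardware does not provide. As an illustration only (this helper is not part of the
// runtime), a multiplying update for doubles follows the identical pattern:
__device__ double atomicMulDouble(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;  // snapshot the current bit pattern
    // Try to install the product; retry if another thread changed the value meanwhile.
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(val * __longlong_as_double(assumed)));
  } while (assumed != old);  // integer compare also terminates correctly on NaN
  return __longlong_as_double(old);
}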
__device__ double atomicMax(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(max(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ float atomicMax(float* address, float val) {
  int* address_as_int = (int*)address;
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
__device__ double atomicMin(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(min(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ double atomicMin(float* address, float val) {
  int* address_as_int = (int*)address;
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed, __float_as_int(min(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
extern "C" __device__ uint64_t agg_count_shared(uint64_t* agg, const int64_t val) {
  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1UL));
}
extern "C" __device__ uint32_t agg_count_int32_shared(uint32_t* agg, const int32_t val) {
  return atomicAdd(agg, 1UL);
}
extern "C" __device__ int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
}
extern "C" __device__ int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
  return atomicAdd(agg, val);
}
extern "C" __device__ void agg_sum_float_shared(int32_t* agg, const float val) {
  atomicAdd(reinterpret_cast<float*>(agg), val);
}
extern "C" __device__ void agg_sum_double_shared(int64_t* agg, const double val) {
  atomicAdd(reinterpret_cast<double*>(agg), val);
}
extern "C" __device__ void agg_max_double_shared(int64_t* agg, const double val) {
  atomicMax(reinterpret_cast<double*>(agg), val);
}
extern "C" __device__ void agg_max_float_shared(int32_t* agg, const float val) {
  atomicMax(reinterpret_cast<float*>(agg), val);
}
#if CUDA_VERSION > 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// Volta+ with a recent CUDA exposes 16-bit atomicCAS directly.
__device__ void atomicMax16(int16_t* agg, const int16_t val) {
  unsigned short int* address_as_us = reinterpret_cast<unsigned short int*>(agg);
  unsigned short int old = *address_as_us, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_us,
                    assumed,
                    static_cast<unsigned short>(max(static_cast<short int>(val),
                                                    static_cast<short int>(assumed))));
  } while (assumed != old);
}
#else
// Fallback: CAS on the aligned 32-bit word that contains the 16-bit slot.
__device__ void atomicMax16(int16_t* agg, const int16_t val) {
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    swap_value =
        (reinterpret_cast<size_t>(agg) & 0x2)
            ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
                      << 16 |
                  (old_value & 0xFFFF)
            : (old_value & 0xFFFF0000) |
                  static_cast<unsigned int>(
                      max(static_cast<int16_t>(old_value & 0xFFFF), val));
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (old_value != compare_value);
}
#endif
__device__ void atomicMax8(int8_t* agg, const int8_t val) {
  // CAS on the aligned 32-bit word that contains the target byte.
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  // __byte_perm selectors that splice one replacement byte into the right position.
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    auto max_value = static_cast<unsigned int>(
        max(val,
            static_cast<int8_t>(__byte_perm(
                compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
    swap_value = __byte_perm(
        compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (compare_value != old_value);
}
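
// A quick way to sanity-check the sub-word paths is to hammer four adjacent int8_t slots
// from many threads and inspect the result on the host. The test driver below is purely
// illustrative and assumes atomicMax8 from the listing above is visible in the same
// translation unit; it is not part of the runtime.
#include <cstdint>
#include <cstdio>

__global__ void max8_test_kernel(int8_t* slots, const int8_t* samples, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Four int8_t slots share each 32-bit word, exercising the __byte_perm path.
    atomicMax8(&slots[i % 4], samples[i]);
  }
}

int max8_test_main() {
  const int n = 1 << 16;
  int8_t* slots;
  int8_t* samples;
  cudaMallocManaged(&slots, 4);
  cudaMallocManaged(&samples, n);
  for (int i = 0; i < 4; ++i) slots[i] = INT8_MIN;
  for (int i = 0; i < n; ++i) samples[i] = static_cast<int8_t>((i * 37) & 0xFF);
  max8_test_kernel<<<(n + 255) / 256, 256>>>(slots, samples, n);
  cudaDeviceSynchronize();
  printf("%d %d %d %d\n", slots[0], slots[1], slots[2], slots[3]);
  cudaFree(slots);
  cudaFree(samples);
  return 0;
}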
#if CUDA_VERSION > 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
__device__ void atomicMin16(int16_t* agg, const int16_t val) {
  unsigned short int* address_as_us = reinterpret_cast<unsigned short int*>(agg);
  unsigned short int old = *address_as_us, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_us,
                    assumed,
                    static_cast<unsigned short>(min(static_cast<short int>(val),
                                                    static_cast<short int>(assumed))));
  } while (assumed != old);
}
#else
// Fallback: CAS on the aligned 32-bit word that contains the 16-bit slot.
__device__ void atomicMin16(int16_t* agg, const int16_t val) {
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    swap_value =
        (reinterpret_cast<size_t>(agg) & 0x2)
            ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
                      << 16 |
                  (old_value & 0xFFFF)
            : (old_value & 0xFFFF0000) |
                  static_cast<unsigned int>(
                      min(static_cast<int16_t>(old_value & 0xFFFF), val));
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (old_value != compare_value);
}
#endif
__device__ void atomicMin16SkipVal(int16_t* agg, const int16_t val, const int16_t skip_val) {
  // CAS on the aligned 32-bit word containing the 16-bit slot; a stored skip_val is
  // treated as "empty" and simply replaced.
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
                                   ? static_cast<int16_t>(old_value >> 16)
                                   : static_cast<int16_t>(old_value & 0xFFFF);
    swap_value =
        (reinterpret_cast<size_t>(agg) & 0x2)
            ? static_cast<unsigned int>(
                  selected_old_val == skip_val ? val : min(selected_old_val, val))
                      << 16 |
                  (old_value & 0xFFFF)
            : (old_value & 0xFFFF0000) |
                  static_cast<unsigned int>(
                      selected_old_val == skip_val ? val : min(selected_old_val, val));
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (old_value != compare_value);
}
__device__ void atomicMin8(int8_t* agg, const int8_t val) {
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    auto min_value = static_cast<unsigned int>(
        min(val,
            static_cast<int8_t>(__byte_perm(
                compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
    swap_value = __byte_perm(
        compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (compare_value != old_value);
}
__device__ void atomicMin8SkipVal(int8_t* agg, const int8_t val, const int8_t skip_val) {
  unsigned int* base_address_u32 =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_address_u32;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    int8_t selected_old_val = static_cast<int8_t>(
        __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
    auto min_value = static_cast<unsigned int>(
        selected_old_val == skip_val ? val : min(val, selected_old_val));
    swap_value = __byte_perm(
        compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
    old_value = atomicCAS(base_address_u32, compare_value, swap_value);
  } while (compare_value != old_value);
}
extern "C" __device__ void agg_min_double_shared(int64_t* agg, const double val) {
  atomicMin(reinterpret_cast<double*>(agg), val);
}
extern "C" __device__ void agg_min_float_shared(int32_t* agg, const float val) {
  atomicMin(reinterpret_cast<float*>(agg), val);
}
extern "C" __device__ int32_t checked_single_agg_id_shared(int64_t* agg,
                                                           const int64_t val,
                                                           const int64_t null_val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
  unsigned long long int old = *address_as_ull, assumed;
  if (val == null_val) {
    return 0;  // null input: nothing to store
  }
  do {
    if (static_cast<int64_t>(old) != null_val) {
      if (static_cast<int64_t>(old) != val) {
        // a different non-null value is already stored: report the executor's
        // "single value found multiple values" error
        return 15;
      }
      break;  // the same value is already stored
    }
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, val);
  } while (assumed != old);
  return 0;
}
#define DEF_AGG_ID_INT_SHARED(n)                                          \
  extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg,      \
                                                    const int##n##_t val) { \
    *agg = val;                                                            \
  }
// (the per-width instantiations of DEF_AGG_ID_INT_SHARED are elided in this listing)
#undef DEF_AGG_ID_INT_SHARED
extern "C" __device__ void agg_id_double_shared(int64_t* agg, const double val) {
  *agg = *(reinterpret_cast<const int64_t*>(&val));
}
extern "C" __device__ int32_t checked_single_agg_id_double_shared(int64_t* agg,
                                                                  const double val,
                                                                  const double null_val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
  unsigned long long int old = *address_as_ull, assumed;
  if (val == null_val) {
    return 0;
  }
  do {
    if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
      if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
        // a different non-null value is already stored
        return 15;
      }
      break;
    }
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
  } while (assumed != old);
  return 0;
}

extern "C" __device__ void agg_id_double_shared_slow(int64_t* agg, const double* val) {
  *agg = *(reinterpret_cast<const int64_t*>(val));
}

extern "C" __device__ int32_t checked_single_agg_id_double_shared_slow(int64_t* agg,
                                                                       const double* valp,
                                                                       const double null_val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
  unsigned long long int old = *address_as_ull, assumed;
  const double val = *valp;
  if (val == null_val) {
    return 0;
  }
  do {
    if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
      if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
        // a different non-null value is already stored
        return 15;
      }
      break;
    }
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
  } while (assumed != old);
  return 0;
}
extern "C" __device__ void agg_id_float_shared(int32_t* agg, const float val) {
  *agg = __float_as_int(val);
}

extern "C" __device__ int32_t checked_single_agg_id_float_shared(int32_t* agg,
                                                                 const float val,
                                                                 const float null_val) {
  int* address_as_int = reinterpret_cast<int*>(agg);
  int old = *address_as_int, assumed;
  if (val == null_val) {
    return 0;
  }
  do {
    if (old != __float_as_int(null_val)) {
      if (old != __float_as_int(val)) {
        // a different non-null value is already stored
        return 15;
      }
      break;
    }
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(val));
  } while (assumed != old);
  return 0;
}
#define DEF_SKIP_AGG(base_agg_func)                              \
  extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared(  \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) {    \
    if (val != skip_val) {                                       \
      return base_agg_func##_shared(agg, val);                   \
    }                                                            \
    return 0; /* skip values do not update the aggregate */      \
  }

// The macro is instantiated once per width; the DEF_SKIP_AGG(...) lines themselves are
// elided in this listing.
#define DATA_T int64_t
#define ADDR_T uint64_t
#undef DATA_T
#undef ADDR_T

#define DATA_T int32_t
#define ADDR_T uint32_t
#undef DATA_T
#undef ADDR_T
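
// With DATA_T and ADDR_T bound as above, each (elided) DEF_SKIP_AGG instantiation
// expands to an ordinary wrapper. Roughly, for the 32-bit count aggregate -- an
// approximate, reformatted expansion shown for illustration, not the literal source:
extern "C" __device__ uint32_t agg_count_int32_skip_val_shared(uint32_t* agg,
                                                               const int32_t val,
                                                               const int32_t skip_val) {
  if (val != skip_val) {
    return agg_count_int32_shared(agg, val);  // defined earlier in this runtime
  }
  return 0;  // skip values leave the counter untouched
}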
extern "C" __device__ void agg_max_int32_skip_val_shared(int32_t* agg, const int32_t val, const int32_t skip_val) {
  if (val != skip_val) {
    atomicMax(agg, val);
  }
}
extern "C" __device__ void agg_max_int16_skip_val_shared(int16_t* agg, const int16_t val, const int16_t skip_val) {
  if (val != skip_val) {
    atomicMax16(agg, val);
  }
}
extern "C" __device__ void agg_min_int16_skip_val_shared(int16_t* agg, const int16_t val, const int16_t skip_val) {
  if (val != skip_val) {
    atomicMin16SkipVal(agg, val, skip_val);
  }
}
extern "C" __device__ void agg_max_int8_skip_val_shared(int8_t* agg, const int8_t val, const int8_t skip_val) {
  if (val != skip_val) {
    atomicMax8(agg, val);
  }
}
extern "C" __device__ void agg_min_int8_skip_val_shared(int8_t* agg, const int8_t val, const int8_t skip_val) {
  if (val != skip_val) {
    atomicMin8SkipVal(agg, val, skip_val);
  }
}

__device__ int32_t atomicMin32SkipVal(int32_t* address, int32_t val, const int32_t skip_val) {
  int32_t old = atomicExch(address, INT_MAX);
  return atomicMin(address, old == skip_val ? val : min(old, val));
}

extern "C" __device__ void agg_min_int32_skip_val_shared(int32_t* agg, const int32_t val, const int32_t skip_val) {
  if (val != skip_val) {
    atomicMin32SkipVal(agg, val, skip_val);
  }
}
__device__ int32_t atomicSum32SkipVal(int32_t* address, const int32_t val, const int32_t skip_val) {
  unsigned int* address_as_int = (unsigned int*)address;
  int32_t old = atomicExch(address_as_int, 0);
  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
  return old == skip_val ? old2 : (old2 + old);
}

extern "C" __device__ int32_t agg_sum_int32_skip_val_shared(int32_t* agg, const int32_t val, const int32_t skip_val) {
  if (val != skip_val) {
    const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
    return old;
  }
  return 0;
}
__device__ int64_t atomicSum64SkipVal(int64_t* address, const int64_t val, const int64_t skip_val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  int64_t old = atomicExch(address_as_ull, 0);
  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
  return old == skip_val ? old2 : (old2 + old);
}

extern "C" __device__ int64_t agg_sum_skip_val_shared(int64_t* agg, const int64_t val, const int64_t skip_val) {
  if (val != skip_val) {
    const int64_t old = atomicSum64SkipVal(agg, val, skip_val);
    return old;
  }
  return 0;
}
__device__ int64_t atomicMin64SkipVal(int64_t* address, int64_t val, const int64_t skip_val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    assumed == skip_val ? val : min((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}

extern "C" __device__ void agg_min_skip_val_shared(int64_t* agg, const int64_t val, const int64_t skip_val) {
  if (val != skip_val) {
    atomicMin64SkipVal(agg, val, skip_val);
  }
}
__device__ int64_t atomicMax64SkipVal(int64_t* address, int64_t val, const int64_t skip_val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    assumed == skip_val ? val : max((long long)val, (long long)assumed));
  } while (assumed != old);
  return old;
}

extern "C" __device__ void agg_max_skip_val_shared(int64_t* agg, const int64_t val, const int64_t skip_val) {
  if (val != skip_val) {
    atomicMax64SkipVal(agg, val, skip_val);
  }
}
#undef DEF_SKIP_AGG
#define DEF_SKIP_AGG(base_agg_func)                              \
  extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared(  \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) {    \
    if (val != skip_val) {                                       \
      return base_agg_func##_shared(agg, val);                   \
    }                                                            \
    return 0; /* skip values do not update the aggregate */      \
  }

#define DATA_T double
#define ADDR_T uint64_t
// (instantiation and the matching #undef pair are elided in this listing)

#define DATA_T float
#define ADDR_T uint32_t
// (instantiation and the matching #undef pair are elided in this listing)
extern "C" __device__ void agg_max_float_skip_val_shared(int32_t* agg,
                                                         const float val,
                                                         const float skip_val) {
  if (__float_as_int(val) != __float_as_int(skip_val)) {
    float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
    atomicMax(reinterpret_cast<float*>(agg),
              __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
  }
}

__device__ float atomicMinFltSkipVal(int32_t* address, float val, const float skip_val) {
  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
  return atomicMin(
      reinterpret_cast<float*>(address),
      __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
}

extern "C" __device__ void agg_min_float_skip_val_shared(int32_t* agg, const float val, const float skip_val) {
  if (__float_as_int(val) != __float_as_int(skip_val)) {
    atomicMinFltSkipVal(agg, val, skip_val);
  }
}
__device__ void atomicSumFltSkipVal(float* address, const float val, const float skip_val) {
  float old = atomicExch(address, 0.f);
  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
}

extern "C" __device__ void agg_sum_float_skip_val_shared(int32_t* agg, const float val, const float skip_val) {
  if (__float_as_int(val) != __float_as_int(skip_val)) {
    atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
  }
}
__device__ void atomicSumDblSkipVal(double* address, const double val, const double skip_val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
  atomicAdd(
      address,
      __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
}

extern "C" __device__ void agg_sum_double_skip_val_shared(int64_t* agg, const double val, const double skip_val) {
  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
    atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
  }
}
__device__ double atomicMinDblSkipVal(double* address, double val, const double skip_val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull;
  unsigned long long int skip_val_as_ull =
      *reinterpret_cast<const unsigned long long*>(&skip_val);
  unsigned long long int assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    assumed == skip_val_as_ull
                        ? *reinterpret_cast<unsigned long long*>(&val)
                        : __double_as_longlong(min(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

extern "C" __device__ void agg_min_double_skip_val_shared(int64_t* agg, const double val, const double skip_val) {
  if (val != skip_val) {
    atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
  }
}
extern "C" __device__ void agg_max_double_skip_val_shared(int64_t* agg,
                                                          const double val,
                                                          const double skip_val) {
  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
    double old = __longlong_as_double(atomicExch(
        reinterpret_cast<unsigned long long int*>(agg), __double_as_longlong(-DBL_MAX)));
    atomicMax(reinterpret_cast<double*>(agg),
              __double_as_longlong(old) == __double_as_longlong(skip_val)
                  ? val
                  : fmax(old, val));
  }
}
extern "C" __device__ bool slotEmptyKeyCAS(int64_t* slot, int64_t new_val, int64_t init_val) {
  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
  const auto empty_key =
      static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
  const auto new_val_cast =
      static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
  if (old_val == *empty_key) {
    return true;
  } else {
    return false;
  }
}
extern "C" __device__ bool slotEmptyKeyCAS_int32(int32_t* slot, int32_t new_val, int32_t init_val) {
  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
  unsigned int compare_value = static_cast<unsigned int>(init_val);
  unsigned int swap_value = static_cast<unsigned int>(new_val);
  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
  return old_value == compare_value;
}
extern "C" __device__ bool slotEmptyKeyCAS_int16(int16_t* slot, int16_t new_val, int16_t init_val) {
  unsigned int* base_slot_address =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
  unsigned int old_value = *base_slot_address;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    // Exit early: if init_val no longer matches, another thread already claimed the slot.
    if (static_cast<unsigned int>(init_val) !=
        __byte_perm(
            compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
      return false;
    }
    swap_value = __byte_perm(compare_value,
                             static_cast<unsigned int>(new_val),
                             (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
    old_value = atomicCAS(base_slot_address, compare_value, swap_value);
  } while (compare_value != old_value);
  return true;
}
extern "C" __device__ bool slotEmptyKeyCAS_int8(int8_t* slot, int8_t new_val, int8_t init_val) {
  unsigned int* base_slot_address =
      reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
  unsigned int old_value = *base_slot_address;
  unsigned int swap_value, compare_value;
  do {
    compare_value = old_value;
    // Exit early: if init_val no longer matches, another thread already claimed the slot.
    if (static_cast<unsigned int>(init_val) !=
        __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
      return false;
    }
    swap_value = __byte_perm(compare_value,
                             static_cast<unsigned int>(new_val),
                             byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
    old_value = atomicCAS(base_slot_address, compare_value, swap_value);
  } while (compare_value != old_value);
  return true;
}
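
// All three slotEmptyKeyCAS variants answer the same question: did this thread claim a
// slot that still held the empty sentinel? A hypothetical caller (not part of the
// runtime) would branch on that result before initializing any per-slot payload.
__device__ void insert_key_example(int64_t* slot,
                                   int64_t* payload,
                                   int64_t key,
                                   int64_t value,
                                   int64_t empty_sentinel) {
  if (slotEmptyKeyCAS(slot, key, empty_sentinel)) {
    *payload = value;  // only the winning thread initializes the payload
  }
}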
#include "../Utils/ChunkIter.cpp"

#define EXECUTE_INCLUDE
// (the runtime sources pulled in under EXECUTE_INCLUDE -- among them the functions that
//  support array operations used by the executor -- are elided in this listing)
#undef EXECUTE_INCLUDE

#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"
extern "C" __device__ uint64_t string_decode(int8_t* chunk_iter_, int64_t pos) {
  ChunkIter* chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
  VarlenDatum vd;
  bool is_end;
  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
  // Pack the 48-bit pointer and the 16-bit length into a single 64-bit value.
  return vd.is_null ? 0
                    : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
                          (static_cast<uint64_t>(vd.length) << 48);
}
extern "C" __device__ void linear_probabilistic_count(uint8_t* bitmap,
                                                      const uint32_t bitmap_bytes,
                                                      const uint8_t* key_bytes,
                                                      const uint32_t key_len) {
  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
  const uint32_t word_idx = bit_pos / 32;
  const uint32_t bit_idx = bit_pos % 32;
  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
}
extern "C" __device__ void agg_count_distinct_bitmap_gpu(int64_t* agg,
                                                         const int64_t val,
                                                         const int64_t min_val,
                                                         const int64_t base_dev_addr,
                                                         const int64_t base_host_addr,
                                                         const uint64_t sub_bitmap_count,
                                                         const uint64_t bitmap_bytes) {
  const uint64_t bitmap_idx = val - min_val;
  const uint32_t byte_idx = bitmap_idx >> 3;
  const uint32_t word_idx = byte_idx >> 2;
  const uint32_t byte_word_idx = byte_idx & 3;
  const int64_t host_addr = *agg;
  // The stored pointer is a host address; translate it into the device mapping and pick
  // this thread's sub-bitmap.
  uint32_t* bitmap = (uint32_t*)(base_dev_addr + host_addr - base_host_addr +
                                 (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes);
  switch (byte_word_idx) {
    case 0:
      atomicOr(&bitmap[word_idx], 1 << (bitmap_idx & 7));
      break;
    case 1:
      atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 8));
      break;
    case 2:
      atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 16));
      break;
    case 3:
      atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 24));
      break;
  }
}
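
// The bit addressing above is easy to verify on the host: the bit for a value v lands at
// position (v - min_val) & 7 within byte (v - min_val) >> 3, and the switch folds that
// into a single 32-bit atomicOr. A standalone host-side check of the arithmetic follows;
// the sample values are arbitrary and this snippet is not part of the runtime.
#include <cstdint>
#include <cstdio>

int bitmap_index_example() {
  const int64_t min_val = 100;
  const int64_t val = 171;
  const uint64_t bitmap_idx = static_cast<uint64_t>(val - min_val);   // 71
  const uint32_t byte_idx = bitmap_idx >> 3;                          // byte 8
  const uint32_t word_idx = byte_idx >> 2;                            // 32-bit word 2
  const uint32_t byte_word_idx = byte_idx & 3;                        // byte 0 within that word
  const uint32_t bit_in_word = (bitmap_idx & 7) + 8 * byte_word_idx;  // bit 7 of the word
  printf("word %u, mask 0x%08x\n", word_idx, 1u << bit_in_word);
  return 0;
}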
extern "C" __device__ void agg_count_distinct_bitmap_skip_val_gpu(int64_t* agg,
                                                                  const int64_t val,
                                                                  const int64_t min_val,
                                                                  const int64_t skip_val,
                                                                  const int64_t base_dev_addr,
                                                                  const int64_t base_host_addr,
                                                                  const uint64_t sub_bitmap_count,
                                                                  const uint64_t bitmap_bytes) {
  if (val != skip_val) {
    agg_count_distinct_bitmap_gpu(
        agg, val, min_val, base_dev_addr, base_host_addr, sub_bitmap_count, bitmap_bytes);
  }
}
extern "C" __device__ void agg_approximate_count_distinct_gpu(int64_t* agg,
                                                              const int64_t key,
                                                              const uint32_t b,
                                                              const int64_t base_dev_addr,
                                                              const int64_t base_host_addr) {
  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
  const uint32_t index = hash >> (64 - b);
  const int32_t rank = get_rank(hash << b, 64 - b);
  const int64_t host_addr = *agg;
  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
  atomicMax(&M[index], rank);
}
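
// For approximate COUNT DISTINCT, the top b bits of the 64-bit hash select one of 2^b
// HyperLogLog registers and the rank is derived from the leading zeros of the remaining
// bits; each register keeps the maximum rank observed. A host-side sketch of that
// bucket/rank split follows; rank_of is a simplified stand-in for get_rank and the hash
// value is an arbitrary example, not part of the runtime.
static uint8_t rank_of(uint64_t x, uint32_t bits) {
  // 1-based position of the first set bit among the top `bits` bits, or bits + 1.
  uint8_t r = 1;
  while (r <= bits && !(x & (1ULL << 63))) {
    x <<= 1;
    ++r;
  }
  return r;
}

int hll_split_example() {
  const uint32_t b = 11;                                           // 2^11 registers
  const uint64_t hash = 0x0123456789abcdefULL;                     // arbitrary example
  const uint32_t index = static_cast<uint32_t>(hash >> (64 - b));  // top b bits
  const uint8_t rank = rank_of(hash << b, 64 - b);                 // remaining bits
  printf("register %u, rank %u\n", index, rank);
  return 0;
}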
extern "C" __device__ void force_sync() {
  __threadfence_block();
}
extern "C" __device__ void sync_warp_protected(int64_t thread_pos, int64_t row_count) {
  // Only sync threads that are not in the last, partially filled warp.
  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
    __syncwarp();
  }
}
extern "C" __device__ void write_back_non_grouped_agg(int64_t* input_buffer,
                                                      int64_t* output_buffer,
                                                      const int32_t agg_idx) {
  if (threadIdx.x == agg_idx) {
    // (the merge of the block-local aggregate into output_buffer[agg_idx] is elided in
    //  this listing)
  }
}
The declarations below index the functions, macros, and device globals that make up this runtime:

__device__ void sync_warp_protected(int64_t thread_pos, int64_t row_count)
__device__ int32_t checked_single_agg_id_double_shared_slow(int64_t *agg, const double *valp, const double null_val)
NEVER_INLINE DEVICE uint32_t MurmurHash1(const void *key, int len, const uint32_t seed)
__device__ void agg_max_float_shared(int32_t *agg, const float val)
__device__ uint32_t agg_count_float_shared(uint32_t *agg, const float val)
__device__ void agg_count_distinct_bitmap_skip_val_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
__device__ bool dynamic_watchdog()
__device__ int64_t * get_matching_group_value_columnar(int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
ALWAYS_INLINE uint32_t agg_count_float(uint32_t *agg, const float val)
__device__ void agg_max_shared(int64_t *agg, const int64_t val)
__device__ void write_back_nop(int64_t *dest, int64_t *src, const int32_t sz)
__device__ void agg_sum_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
ALWAYS_INLINE uint64_t agg_count(uint64_t *agg, const int64_t)
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
__device__ void agg_min_int32_shared(int32_t *agg, const int32_t val)
__device__ int8_t thread_warp_idx(const int8_t warp_sz)
__device__ int64_t dw_sm_cycle_start[128]
__device__ void agg_id_float_shared(int32_t *agg, const float val)
__device__ void agg_min_double_shared(int64_t *agg, const double val)
__device__ int64_t get_thread_index()
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)
__device__ int32_t pos_step_impl()
__device__ void write_back_non_grouped_agg(int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
__device__ void agg_min_int8_shared(int8_t *agg, const int8_t val)
__device__ int32_t checked_single_agg_id_double_shared(int64_t *agg, const double val, const double null_val)
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)
__device__ const int64_t * init_shared_mem_nop(const int64_t *groups_buffer, const int32_t groups_buffer_size)
__device__ double atomicMin(double *address, double val)
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)
__device__ int32_t checked_single_agg_id_float_shared(int32_t *agg, const float val, const float null_val)
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)
__device__ int64_t dw_cycle_budget
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)
__device__ void agg_id_double_shared_slow(int64_t *agg, const double *val)
ALWAYS_INLINE uint32_t agg_count_int32(uint32_t *agg, const int32_t)
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
__device__ void agg_min_float_shared(int32_t *agg, const float val)
__device__ int64_t atomicMin64(int64_t *address, int64_t val)
__device__ int64_t * declare_dynamic_shared_memory()
__device__ void agg_max_double_shared(int64_t *agg, const double val)
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)
__device__ int32_t agg_sum_int32_shared(int32_t *agg, const int32_t val)
__device__ int64_t agg_sum_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void agg_sum_float_shared(int32_t *agg, const float val)
__device__ void agg_id_double_shared(int64_t *agg, const double val)
__device__ void agg_max_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void atomicMax16(int16_t *agg, const int16_t val)
#define DEF_SKIP_AGG(base_agg_func)
__device__ int64_t get_block_index()
__device__ void agg_min_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
__device__ bool check_interrupt()
__device__ bool slotEmptyKeyCAS_int32(int32_t *slot, int32_t new_val, int32_t init_val)
NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)
__device__ int32_t agg_sum_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
__device__ void agg_min_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
__device__ void linear_probabilistic_count(uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)
__device__ void agg_sum_double_shared(int64_t *agg, const double val)
__inline__ __device__ uint32_t get_smid(void)
__device__ void agg_min_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)
__device__ int64_t atomicMax64(int64_t *address, int64_t val)
__device__ bool slotEmptyKeyCAS(int64_t *slot, int64_t new_val, int64_t init_val)
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)
__device__ void atomicMin16(int16_t *agg, const int16_t val)
__device__ void agg_max_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
ALWAYS_INLINE uint64_t agg_count_double(uint64_t *agg, const double val)
__device__ int32_t runtime_interrupt_flag
__device__ void agg_approximate_count_distinct_gpu(int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
__device__ void sync_warp()
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)
__device__ void agg_sum_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
__device__ void agg_max_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
__device__ void agg_max_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
__device__ void atomicMin8(int8_t *agg, const int8_t val)
__device__ void agg_min_int16_shared(int16_t *agg, const int16_t val)
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)
__device__ const int64_t * init_shared_mem(const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
__device__ void agg_min_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
#define DEF_AGG_ID_INT_SHARED(n)
__device__ uint64_t agg_count_double_shared(uint64_t *agg, const double val)
__device__ T get_empty_key()
__device__ void agg_min_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
__device__ void sync_threadblock()
__device__ void agg_min_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
__device__ void atomicMax8(int8_t *agg, const int8_t val)
__device__ void agg_id_shared(int64_t *agg, const int64_t val)
__device__ double atomicMax(double *address, double val)
__device__ uint64_t string_decode(int8_t *chunk_iter_, int64_t pos)
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)
__device__ int32_t checked_single_agg_id_shared(int64_t *agg, const int64_t val, const int64_t null_val)
__device__ void agg_max_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
__device__ int32_t dw_abort
__device__ bool slotEmptyKeyCAS_int16(int16_t *slot, int16_t new_val, int16_t init_val)
__device__ void agg_max_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)
__device__ void force_sync()
__device__ void agg_min_shared(int64_t *agg, const int64_t val)
__device__ bool slotEmptyKeyCAS_int8(int8_t *slot, int8_t new_val, int8_t init_val)
__device__ int32_t group_buff_idx_impl()