#error This code is not intended to be compiled with a CUDA C++ compiler

#include "../Shared/funcannotations.h"
#define DEF_ARITH_NULLABLE(type, null_type, opname, opsym)        \
  extern "C" ALWAYS_INLINE type opname##_##type##_nullable(       \
      const type lhs, const type rhs, const null_type null_val) { \
    if (lhs != null_val && rhs != null_val) {                     \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym)    \
  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_lhs(   \
      const type lhs, const type rhs, const null_type null_val) { \
    if (lhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym)    \
  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_rhs(   \
      const type lhs, const type rhs, const null_type null_val) { \
    if (rhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_CMP_NULLABLE(type, null_type, opname, opsym)      \
  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable( \
      const type lhs,                                         \
      const type rhs,                                         \
      const null_type null_val,                               \
      const int8_t null_bool_val) {                           \
    if (lhs != null_val && rhs != null_val) {                 \
      return lhs opsym rhs;                                   \
    }                                                         \
    return null_bool_val;                                     \
  }
#define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym)      \
  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
      const type lhs,                                             \
      const type rhs,                                             \
      const null_type null_val,                                   \
      const int8_t null_bool_val) {                               \
    if (lhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_bool_val;                                         \
  }
#define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym)      \
  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
      const type lhs,                                             \
      const type rhs,                                             \
      const null_type null_val,                                   \
      const int8_t null_bool_val) {                               \
    if (rhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_bool_val;                                         \
  }
#define DEF_SAFE_DIV_NULLABLE(type, null_type, opname)            \
  extern "C" ALWAYS_INLINE type safe_div_##type(                  \
      const type lhs, const type rhs, const null_type null_val) { \
    if (lhs != null_val && rhs != null_val && rhs != 0) {         \
      return lhs / rhs;                                           \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
  DEF_ARITH_NULLABLE(type, null_type, add, +)        \
  DEF_ARITH_NULLABLE(type, null_type, sub, -)        \
  DEF_ARITH_NULLABLE(type, null_type, mul, *)        \
  DEF_ARITH_NULLABLE(type, null_type, div, /)        \
  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div)   \
  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +)    \
  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -)    \
  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *)    \
  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /)    \
  DEF_CMP_NULLABLE(type, null_type, eq, ==)          \
  DEF_CMP_NULLABLE(type, null_type, ne, !=)          \
  DEF_CMP_NULLABLE(type, null_type, lt, <)           \
  DEF_CMP_NULLABLE(type, null_type, gt, >)           \
  DEF_CMP_NULLABLE(type, null_type, le, <=)          \
  DEF_CMP_NULLABLE(type, null_type, ge, >=)          \
  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==)      \
  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=)      \
  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <)       \
  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >)       \
  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=)      \
  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <)       \
  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >)       \
  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
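// Illustrative sketch, not part of the original file: assuming an instantiation
// such as DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t) (the exact type pairs the
// engine instantiates are not shown in this listing), the expansion of
// DEF_ARITH_NULLABLE(int32_t, int64_t, add, +) is equivalent to the following,
// with ALWAYS_INLINE replaced by plain inline so the sketch is self-contained:
#include <cstdint>

extern "C" inline int32_t add_int32_t_nullable_example(const int32_t lhs,
                                                       const int32_t rhs,
                                                       const int64_t null_val) {
  // Any operand equal to the NULL sentinel makes the result NULL.
  if (lhs != null_val && rhs != null_val) {
    return lhs + rhs;
  }
  return null_val;
}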
#undef DEF_BINARY_NULLABLE_ALL_OPS
#undef DEF_SAFE_DIV_NULLABLE
#undef DEF_CMP_NULLABLE_RHS
#undef DEF_CMP_NULLABLE_LHS
#undef DEF_CMP_NULLABLE
#undef DEF_ARITH_NULLABLE_RHS
#undef DEF_ARITH_NULLABLE_LHS
#undef DEF_ARITH_NULLABLE
extern "C" ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand,
    const uint64_t scale, const int64_t operand_null_val, const int64_t result_null_val) {
  return operand != operand_null_val ? operand * scale : result_null_val;
}

extern "C" ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand,
    const int64_t scale, const int64_t null_val) {
  if (operand == null_val) {
    return null_val;
  }
  int64_t tmp = scale >> 1;
  tmp = operand >= 0 ? operand + tmp : operand - tmp;
  return tmp / scale;
}

extern "C" ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand,
    const int64_t scale, const int64_t null_val) {
  int64_t tmp = scale >> 1;
  tmp = operand >= 0 ? operand + tmp : operand - tmp;
  return tmp / scale;
}

extern "C" ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend,
                                               const int64_t divisor) {
  return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
}

extern "C" ALWAYS_INLINE int64_t floor_div_nullable_lhs(const int64_t dividend,
    const int64_t divisor, const int64_t null_val) {
  return dividend == null_val ? null_val : floor_div_lhs(dividend, divisor);
}
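// Illustrative sketch, not part of the original file: floor_div_lhs above
// implements floored division (for positive divisors), which differs from C++'s
// truncating '/' when the dividend is negative. A standalone check of the same
// identity:
#include <cassert>
#include <cstdint>

inline int64_t floor_div_sketch(const int64_t dividend, const int64_t divisor) {
  return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
}

int main() {
  assert(floor_div_sketch(7, 2) == 3);    // same as truncating division
  assert(floor_div_sketch(-7, 2) == -4);  // truncating division would give -3
  assert(-7 / 2 == -3);                   // built-in '/' truncates toward zero
  return 0;
}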
#define DEF_UMINUS_NULLABLE(type, null_type)                                  \
  extern "C" ALWAYS_INLINE type uminus_##type##_nullable(const type operand,  \
                                                         const null_type null_val) { \
    return operand == null_val ? null_val : -operand;                         \
  }

#undef DEF_UMINUS_NULLABLE
#define DEF_CAST_NULLABLE(from_type, to_type)                                  \
  extern "C" ALWAYS_INLINE to_type cast_##from_type##_to_##to_type##_nullable( \
      const from_type operand,                                                 \
      const from_type from_null_val,                                           \
      const to_type to_null_val) {                                             \
    return operand == from_null_val ? to_null_val : operand;                   \
  }

#define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
  DEF_CAST_NULLABLE(type1, type2)             \
  DEF_CAST_NULLABLE(type2, type1)

#undef DEF_CAST_NULLABLE_BIDIR
#undef DEF_CAST_NULLABLE
extern "C" ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val) {
  return operand == null_val ? operand : (operand ? 0 : 1);
}

extern "C" ALWAYS_INLINE int8_t logical_and(const int8_t lhs, const int8_t rhs, const int8_t null_val) {
  if (lhs == null_val) {
    return rhs == 0 ? rhs : null_val;
  }
  if (rhs == null_val) {
    return lhs == 0 ? lhs : null_val;
  }
  return (lhs && rhs) ? 1 : 0;
}

extern "C" ALWAYS_INLINE int8_t logical_or(const int8_t lhs, const int8_t rhs, const int8_t null_val) {
  if (lhs == null_val) {
    return rhs == 0 ? null_val : rhs;
  }
  if (rhs == null_val) {
    return lhs == 0 ? null_val : lhs;
  }
  return (lhs || rhs) ? 1 : 0;
}
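// Illustrative sketch, not part of the original file: logical_and/logical_or
// above implement SQL three-valued logic with an int8_t NULL sentinel. The
// sentinel value used below is an assumption for illustration only; the real
// sentinel is defined elsewhere in the engine. Reference copies of the two
// helpers (renamed to avoid clashing with the extern "C" symbols) verify the
// expected truth table:
#include <cassert>
#include <cstdint>

inline int8_t ref_logical_and(int8_t lhs, int8_t rhs, int8_t null_val) {
  if (lhs == null_val) return rhs == 0 ? rhs : null_val;
  if (rhs == null_val) return lhs == 0 ? lhs : null_val;
  return (lhs && rhs) ? 1 : 0;
}

inline int8_t ref_logical_or(int8_t lhs, int8_t rhs, int8_t null_val) {
  if (lhs == null_val) return rhs == 0 ? null_val : rhs;
  if (rhs == null_val) return lhs == 0 ? null_val : lhs;
  return (lhs || rhs) ? 1 : 0;
}

int main() {
  const int8_t kNull = -128;  // assumed sentinel, for illustration only
  assert(ref_logical_and(kNull, 0, kNull) == 0);      // NULL AND FALSE = FALSE
  assert(ref_logical_and(kNull, 1, kNull) == kNull);  // NULL AND TRUE  = NULL
  assert(ref_logical_or(kNull, 1, kNull) == 1);       // NULL OR TRUE   = TRUE
  assert(ref_logical_or(kNull, 0, kNull) == kNull);   // NULL OR FALSE  = NULL
  return 0;
}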
extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t* agg,
                                                        const int64_t val,
                                                        const int64_t min_val) {
  const uint64_t bitmap_idx = val - min_val;
  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
}
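// Illustrative sketch, not part of the original file: agg_count_distinct_bitmap
// sets one bit per distinct (val - min_val). A hypothetical reader could recover
// the exact distinct count from such a bitmap with a popcount pass:
#include <bitset>
#include <cstddef>
#include <cstdint>

inline int64_t count_bits_set(const int8_t* bitmap, size_t num_bytes) {
  int64_t count = 0;
  for (size_t i = 0; i < num_bytes; ++i) {
    // Count the set bits of each byte; the total equals the number of
    // distinct values recorded so far.
    count += std::bitset<8>(static_cast<uint8_t>(bitmap[i])).count();
  }
  return count;
}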
// Two alternative definitions from the branches of a preprocessor conditional
// (the condition itself is elided in this listing):
#define GPU_RT_STUB NEVER_INLINE
#define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
extern "C" NEVER_INLINE void agg_approximate_count_distinct(int64_t* agg,
                                                            const int64_t key,
                                                            const uint32_t b) {
  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
  const uint32_t index = hash >> (64 - b);
  const uint8_t rank = get_rank(hash << b, 64 - b);
  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
  M[index] = std::max(M[index], rank);
}
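// Illustrative sketch, not part of the original file: the update above is the
// standard HyperLogLog register write (bucket index from the top b bits of the
// hash, rank of the remaining bits). A simplified estimator over the resulting
// register array M -- ignoring the small/large-range corrections the engine may
// apply elsewhere -- would look like:
#include <cmath>
#include <cstdint>
#include <vector>

inline double hll_estimate(const std::vector<uint8_t>& M) {
  const double m = static_cast<double>(M.size());   // number of registers (2^b)
  const double alpha = 0.7213 / (1.0 + 1.079 / m);  // standard HLL bias constant
  double inv_sum = 0.0;
  for (uint8_t rank : M) {
    inv_sum += std::ldexp(1.0, -static_cast<int>(rank));  // add 2^-rank
  }
  return alpha * m * m / inv_sum;  // harmonic-mean cardinality estimate
}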
extern "C" ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
                                           const int64_t val,
                                           const int64_t min_val,
                                           const int64_t max_val,
                                           const int64_t null_val,
                                           const int8_t null_bool_val) {
  if (val == null_val) {
    return null_bool_val;
  }
  if (val < min_val || val > max_val) {
    return 0;
  }
  const uint64_t bitmap_idx = val - min_val;
  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
                 (1 << (bitmap_idx & 7))
             ? 1
             : 0;
}
extern "C" ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
  const auto old = *agg;
  *agg += val;
  return old;
}

extern "C" ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
  *agg = std::max(*agg, val);
}

extern "C" ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
  *agg = std::min(*agg, val);
}

// Fragments of checked_single_agg_id and agg_count_distinct_bitmap_skip_val
// (their interiors are elided in this listing):
    const int64_t null_val) {
  if (val == null_val) {
  } else if (*agg == null_val) {
    const int64_t min_val,
    const int64_t skip_val) {
  if (val != skip_val) {
  const auto old = *agg;
#define DEF_AGG_MAX_INT(n)                                                              \
  extern "C" ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, const int##n##_t val) { \
    *agg = std::max(*agg, val);                                                         \
  }

#undef DEF_AGG_MAX_INT
#define DEF_AGG_MIN_INT(n)                                                              \
  extern "C" ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, const int##n##_t val) { \
    *agg = std::min(*agg, val);                                                         \
  }

#undef DEF_AGG_MIN_INT
#define DEF_AGG_ID_INT(n)                                                              \
  extern "C" ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, const int##n##_t val) { \
    *agg = val;                                                                        \
  }

#define DEF_CHECKED_SINGLE_AGG_ID_INT(n)                                  \
  extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_int##n(          \
      int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
    if (val == null_val) {                                                \
    } else if (*agg == null_val) {                                        \
    /* remaining branches elided in this listing */

#undef DEF_AGG_ID_INT
#undef DEF_CHECKED_SINGLE_AGG_ID_INT
#define DEF_WRITE_PROJECTION_INT(n)                                      \
  extern "C" ALWAYS_INLINE void write_projection_int##n(                 \
      int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) {  \
    if (val != init_val) {                                               \
      *reinterpret_cast<int##n##_t*>(slot_ptr) = val;                    \
    }                                                                    \
  }

#undef DEF_WRITE_PROJECTION_INT
// Fragments of agg_sum_skip_val, agg_sum_int32_skip_val, agg_count_skip_val and
// agg_count_int32_skip_val (their interiors are elided in this listing):
const int64_t skip_val) {
  const auto old = *agg;
  if (val != skip_val) {
    if (old != skip_val) {

const int32_t skip_val) {
  const auto old = *agg;
  if (val != skip_val) {
    if (old != skip_val) {

const int64_t skip_val) {
  if (val != skip_val) {

const int32_t skip_val) {
  if (val != skip_val) {
#define DEF_SKIP_AGG_ADD(base_agg_func)                        \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(      \
      DATA_T* agg, const DATA_T val, const DATA_T skip_val) {  \
    if (val != skip_val) {                                     \
      base_agg_func(agg, val);                                 \
    }                                                          \
  }

#define DEF_SKIP_AGG(base_agg_func)                            \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(      \
      DATA_T* agg, const DATA_T val, const DATA_T skip_val) {  \
    if (val != skip_val) {                                     \
      const DATA_T old_agg = *agg;                             \
      if (old_agg != skip_val) {                               \
        base_agg_func(agg, val);                               \
      } else {                                                 \
        *agg = val;                                            \
      }                                                        \
    }                                                          \
  }

// In the full source each DATA_T definition below is followed by the skip-val
// instantiations for that width and a matching #undef DATA_T; those lines are
// elided in this listing.
#define DATA_T int64_t
#define DATA_T int32_t
#define DATA_T int16_t
#define DATA_T int8_t

#undef DEF_SKIP_AGG_ADD
extern "C" ALWAYS_INLINE void agg_sum_double(int64_t* agg, const double val) {
  const auto r = *reinterpret_cast<const double*>(agg) + val;
  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
}

extern "C" ALWAYS_INLINE void agg_max_double(int64_t* agg, const double val) {
  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_min_double(int64_t* agg, const double val) {
  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_id_double(int64_t* agg, const double val) {
  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
}
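// Illustrative sketch, not part of the original file: double aggregates are kept
// bit-for-bit inside int64_t slots (see agg_id_double above), and a reader such
// as load_double later reinterprets the same bits back. The round trip, written
// with memcpy as the portable equivalent of the may_alias_ptr casts:
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  int64_t slot = 0;
  const double val = 3.5;
  std::memcpy(&slot, &val, sizeof(double));  // store: same effect as the bit cast above
  double out = 0.0;
  std::memcpy(&out, &slot, sizeof(double));  // load: what load_double recovers
  assert(out == val);
  return 0;
}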
extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_double(int64_t* agg,
                                                              const double val,
                                                              const double null_val) {
  if (val == null_val) {
    // ...
  }
  if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
    // ...
  } else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
    *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
    // (remaining branches and return values elided in this listing)
extern "C" ALWAYS_INLINE void agg_sum_float(int32_t* agg, const float val) {
  const auto r = *reinterpret_cast<const float*>(agg) + val;
  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
}

extern "C" ALWAYS_INLINE void agg_max_float(int32_t* agg, const float val) {
  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_min_float(int32_t* agg, const float val) {
  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
}
extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_float(int32_t* agg,
                                                             const float val,
                                                             const float null_val) {
  if (val == null_val) {
    // ...
  }
  if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
    // ...
  } else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
    *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
    // (remaining branches and return values elided in this listing)
// Fragments of agg_count_double_skip_val and agg_count_float_skip_val
// (their interiors are elided in this listing):
const double skip_val) {
  if (val != skip_val) {

const float skip_val) {
  if (val != skip_val) {
#define DEF_SKIP_AGG_ADD(base_agg_func)                        \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(      \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) {  \
    if (val != skip_val) {                                     \
      base_agg_func(agg, val);                                 \
    }                                                          \
  }

#define DEF_SKIP_AGG(base_agg_func)                                                \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(                          \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) {                      \
    if (val != skip_val) {                                                         \
      const ADDR_T old_agg = *agg;                                                 \
      if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
        base_agg_func(agg, val);                                                   \
      } else {                                                                     \
        *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val));              \
      }                                                                            \
    }                                                                              \
  }

// In the full source the DATA_T/ADDR_T definitions below are each followed by
// the floating-point skip-val instantiations and matching #undef lines; those
// are elided in this listing.
#define DATA_T double
#define ADDR_T int64_t

#define ADDR_T int32_t

#undef DEF_SKIP_AGG_ADD
// Fragments of decimal_floor / decimal_ceil (the branch structure is elided in
// this listing):
  return x / scale * scale;
  return x / scale * scale - scale;
// (the return statements of each stub body are elided in this listing)
#define DEF_SHARED_AGG_RET_STUBS(base_agg_func)                                        \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg,                \
                                                         const int64_t val) {          \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared(                     \
      uint64_t* agg, const int64_t val, const int64_t skip_val) {                      \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg,          \
                                                               const int32_t val) {    \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared(               \
      uint32_t* agg, const int32_t val, const int32_t skip_val) {                      \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg,         \
                                                                const double val) {    \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared(              \
      uint64_t* agg, const double val, const double skip_val) {                        \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg,          \
                                                               const float val) {      \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared(               \
      uint32_t* agg, const float val, const float skip_val) {                          \
#define DEF_SHARED_AGG_STUBS(base_agg_func)                                              \
  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared(                           \
      int64_t* agg, const int64_t val, const int64_t skip_val) {}                        \
  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg,                 \
                                                           const int32_t val) {}         \
  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg,                 \
                                                           const int16_t val) {}         \
  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg,                   \
                                                          const int8_t val) {}           \
  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared(                     \
      int32_t* agg, const int32_t val, const int32_t skip_val) {}                        \
  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg,                \
                                                            const double val) {}         \
  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared(                    \
      int64_t* agg, const double val, const double skip_val) {}                          \
  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg,                 \
                                                           const float val) {}           \
  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared(                     \
      int32_t* agg, const float val, const float skip_val) {}
// Trailing parameter lines of further GPU stub definitions (the corresponding
// function names and bodies are elided in this listing):
    const int64_t null_val) {
    const int32_t null_val) {
    const int16_t null_val) {
    const int8_t null_val) {
    const double null_val) {
    const float null_val) {
    const int16_t skip_val) {}
    const int8_t skip_val) {}
    const int16_t skip_val) {}
    const int8_t skip_val) {}
    const int64_t skip_val) {
    const int32_t skip_val) {
    const double skip_val) {}
    const float skip_val) {}
// Fragments of several small helpers (write-back stub, pos_start_impl,
// error-code accessors, init_shared_mem_nop and related functions); their
// leading lines are elided in this listing:
    int64_t* output_buffer,
    const int32_t num_agg_cols){};

  int32_t row_index_resume{0};
    row_index_resume = error_code[0];
  return row_index_resume;

    int32_t* error_codes) {

    const int64_t* groups_buffer,
    const int32_t groups_buffer_size) {
  return groups_buffer;

    const int32_t groups_buffer_size) {
// Parameter lists of the group-by buffer initialization helpers
// (init_group_by_buffer_* and init_columnar_group_by_buffer_gpu); the function
// names, remaining parameters and loop bodies are elided in this listing.
    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_qw_count,
    const uint32_t agg_col_count,
    const int8_t warp_size) {
  assert(groups_buffer);

    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_qw_count,
    const uint32_t agg_col_count,
    const bool blocks_share_memory,
    const int32_t frag_idx) {
  assert(groups_buffer);

    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_qw_count,
    const uint32_t agg_col_count,
    const int8_t warp_size) {
  assert(groups_buffer);
template <typename T>
ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
                                                const uint32_t h,
                                                const T* key,
                                                const uint32_t key_count,
                                                const uint32_t row_size_quad) {
  auto off = h * row_size_quad;
  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
  if (*row_ptr == get_empty_key<T>()) {
    memcpy(row_ptr, key, key_count * sizeof(T));
    auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
    // (return elided in this listing)
  }
  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
    auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
    // (return elided in this listing)
  }
  // (fall-through return elided in this listing)
}
// Fragment of the key_width dispatcher for get_matching_group_value (other
// lines elided in this listing):
    const uint32_t key_count,
    const uint32_t key_width,
    const uint32_t row_size_quad) {
  switch (key_width) {
        reinterpret_cast<const int32_t*>(key),
template <typename T>
ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
                                                             const uint32_t entry_count,
                                                             const uint32_t h,
                                                             const T* key,
                                                             const uint32_t key_count) {
  // The column-major slot offset 'off' is derived from h; its updates and the
  // return statements are elided in this listing.
  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
  if (key_buffer[off] == get_empty_key<T>()) {
    for (size_t i = 0; i < key_count; ++i) {
      key_buffer[off] = key[i];
    }
  }
  for (size_t i = 0; i < key_count; ++i) {
    if (key_buffer[off] != key[i]) {
    }
  }
}
// Fragment of the key_width dispatcher for get_matching_group_value_columnar_slot
// (other lines elided in this listing):
    const uint32_t entry_count,
    const uint32_t key_count,
    const uint32_t key_width) {
  switch (key_width) {
        reinterpret_cast<const int32_t*>(key),
      groups_buffer, entry_count, h, key, key_count);
extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
    int64_t* groups_buffer,
    const uint32_t h,
    const int64_t* key,
    const uint32_t key_qw_count,
    const size_t entry_count) {
  // The column-major offset 'off' and parts of the probe logic are elided in
  // this listing.
  for (size_t i = 0; i < key_qw_count; ++i) {
    groups_buffer[off] = key[i];
  }
  return &groups_buffer[off];

  for (size_t i = 0; i < key_qw_count; ++i) {
    if (groups_buffer[off] != key[i]) {
    }
  }
  return &groups_buffer[off];
}
extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
    int64_t* groups_buffer,
    const uint32_t hashed_index,
    const int64_t* key,
    const uint32_t key_count,
    const uint32_t row_size_quad) {
  uint32_t off = hashed_index * row_size_quad;
  // (a conditional guarding the key write is elided in this listing)
  for (uint32_t i = 0; i < key_count; ++i) {
    groups_buffer[off + i] = key[i];
  }
  return groups_buffer + off + key_count;
}
extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash_keyless(
    int64_t* groups_buffer,
    const uint32_t hashed_index,
    const uint32_t row_size_quad) {
  return groups_buffer + row_size_quad * hashed_index;
}
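// Illustrative sketch, not part of the original file: with a keyless perfect
// hash, a group's slots start at a fixed offset computed purely from its hashed
// index. With an assumed row_size_quad of 4 (four 64-bit slots per group):
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t row_size_quad = 4;   // assumed row width in 64-bit quadwords
  int64_t groups_buffer[4 * 8] = {};  // 8 groups, zero-initialized, for illustration
  const uint32_t hashed_index = 3;
  int64_t* row = groups_buffer + row_size_quad * hashed_index;
  row[0] += 1;  // e.g. bump a COUNT slot for group 3
  assert(groups_buffer[12] == 1);  // group 3 starts at offset 3 * 4 = 12
  return 0;
}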
extern "C" ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(
    int64_t* groups_buffer,
    const uint32_t hashed_index,
    const int64_t* key,
    const uint32_t key_count,
    const uint32_t entry_count) {
  // (a conditional guarding the key write is elided in this listing)
  for (uint32_t i = 0; i < key_count; i++) {
    groups_buffer[i * entry_count + hashed_index] = key[i];
  }
}
extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
    int64_t* groups_buffer, const int64_t key, const int64_t min_key,
    const int64_t, const uint32_t row_size_quad) {
  return groups_buffer + row_size_quad * (key - min_key);
}

extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
    int64_t* groups_buffer, const int64_t key, const int64_t min_key,
    const int64_t, const uint32_t row_size_quad,
    const uint8_t thread_warp_idx, const uint8_t warp_size) {
  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
}
extern "C" ALWAYS_INLINE int8_t* extract_str_ptr(const uint64_t str_and_len) {
  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
}

extern "C" ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len) {
  return static_cast<int64_t>(str_and_len) >> 48;
}

extern "C" ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr, const int32_t len) {
  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
         (static_cast<const uint64_t>(len) << 48);
}
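// Illustrative sketch, not part of the original file: string_pack stores a
// pointer in the low 48 bits and the length in the high 16 bits of a single
// uint64_t; extract_str_ptr/extract_str_len undo it. A round trip, assuming the
// pointer fits in 48 bits (as typical x86-64 user-space addresses do):
#include <cassert>
#include <cstdint>

int main() {
  const char* text = "hello";
  const uint64_t packed =
      (reinterpret_cast<uint64_t>(text) & 0xffffffffffff) |
      (static_cast<uint64_t>(5) << 48);
  const char* ptr = reinterpret_cast<const char*>(packed & 0xffffffffffff);
  const int32_t len = static_cast<int64_t>(packed) >> 48;
  assert(ptr == text);  // holds only when the high 16 pointer bits were zero
  assert(len == 5);
  return 0;
}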
#include "../Utils/StringLike.cpp"
// Parameter tails of char_length and char_length_nullable (their bodies are
// elided in this listing):
    const int32_t str_len) {
    const int32_t str_len,
    const int32_t int_null) {
extern "C" ALWAYS_INLINE DEVICE bool sample_ratio(const double proportion,
                                                  const int64_t row_offset) {
  const int64_t threshold = 4294967296 * proportion;
  return (row_offset * 2654435761) % 4294967296 < threshold;
}
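// Illustrative sketch, not part of the original file: sample_ratio keeps a row
// when a multiplicative hash of its offset (Knuth's constant 2654435761) falls
// below proportion * 2^32. A standalone empirical check of the hit rate:
#include <cstdint>
#include <cstdio>

int main() {
  const double proportion = 0.1;  // keep roughly 10% of rows
  const int64_t threshold = 4294967296 * proportion;
  const int64_t n = 1000000;
  int64_t kept = 0;
  for (int64_t row = 0; row < n; ++row) {
    if ((row * 2654435761) % 4294967296 < threshold) {
      ++kept;
    }
  }
  // Prints a fraction close to 0.1 for this range of row offsets.
  std::printf("kept %lld of %lld rows (~%.3f)\n",
              static_cast<long long>(kept), static_cast<long long>(n),
              static_cast<double>(kept) / n);
  return 0;
}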
extern "C" ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff,
                                                        const int64_t pos) {
  return reinterpret_cast<const int64_t*>(output_buff)[pos];
}

extern "C" ALWAYS_INLINE double percent_window_func(const int64_t output_buff,
                                                    const int64_t pos) {
  return reinterpret_cast<const double*>(output_buff)[pos];
}
extern "C" ALWAYS_INLINE double load_double(const int64_t* agg) {
  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
}

extern "C" ALWAYS_INLINE float load_float(const int32_t* agg) {
  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
}
extern "C" ALWAYS_INLINE double load_avg_int(const int64_t* sum,
                                             const int64_t* count,
                                             const double null_val) {
  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
}

extern "C" ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
                                                 const int64_t* count,
                                                 const double null_val,
                                                 const uint32_t scale) {
  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
}

extern "C" ALWAYS_INLINE double load_avg_double(const int64_t* agg,
                                                const int64_t* count,
                                                const double null_val) {
  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
                     : null_val;
}

extern "C" ALWAYS_INLINE double load_avg_float(const int32_t* agg,
                                               const int32_t* count,
                                               const double null_val) {
  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
                     : null_val;
}
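// Illustrative sketch, not part of the original file: an integer AVG is
// finalized from two slots, a running sum and a running count, exactly as
// load_avg_int does above, with NULL produced when no rows contributed:
#include <cassert>
#include <cstdint>

inline double finalize_avg(const int64_t* sum, const int64_t* count,
                           const double null_val) {
  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
}

int main() {
  const int64_t sum = 10, count = 4, empty = 0;
  assert(finalize_avg(&sum, &count, -1.0) == 2.5);   // 10 / 4
  assert(finalize_avg(&sum, &empty, -1.0) == -1.0);  // empty group -> NULL sentinel
  return 0;
}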
extern "C" ALWAYS_INLINE void linear_probabilistic_count(uint8_t* bitmap,
                                                         const uint32_t bitmap_bytes,
                                                         const uint8_t* key_bytes,
                                                         const uint32_t key_len) {
  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
  const uint32_t word_idx = bit_pos / 32;
  const uint32_t bit_idx = bit_pos % 32;
  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
}
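// Illustrative sketch, not part of the original file: linear_probabilistic_count
// sets one bit per hashed key. A standard linear-counting estimate (an
// assumption here; the engine's own estimator may differ) recovers an
// approximate distinct count from the fraction of bits still zero:
#include <cmath>
#include <cstddef>
#include <cstdint>

inline double linear_counting_estimate(const uint8_t* bitmap, size_t bitmap_bytes) {
  const double m = bitmap_bytes * 8.0;  // total bits in the bitmap
  double zero_bits = 0.0;
  for (size_t i = 0; i < bitmap_bytes; ++i) {
    for (int b = 0; b < 8; ++b) {
      if (!(bitmap[i] & (1u << b))) {
        zero_bits += 1.0;
      }
    }
  }
  // n_hat = m * ln(m / zero_bits); undefined when every bit is set.
  return m * std::log(m / zero_bits);
}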
extern "C" NEVER_INLINE void query_stub_hoisted_literals(const int8_t** col_buffers,
                                                         const int8_t* literals,
                                                         const int64_t* num_rows,
                                                         const uint64_t* frag_row_offsets,
                                                         const int32_t* max_matched,
                                                         const int64_t* init_agg_value,
                                                         int64_t** out,
                                                         uint32_t frag_idx,
                                                         const int64_t* join_hash_tables,
                                                         int32_t* error_code,
                                                         int32_t* total_matched) {
  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
         init_agg_value || out || frag_idx || error_code || join_hash_tables ||
         total_matched);
}
extern "C" void multifrag_query_hoisted_literals(const int8_t*** col_buffers,
                                                 const uint64_t* num_fragments,
                                                 const int8_t* literals,
                                                 const int64_t* num_rows,
                                                 const uint64_t* frag_row_offsets,
                                                 const int32_t* max_matched,
                                                 int32_t* total_matched,
                                                 const int64_t* init_agg_value,
                                                 int64_t** out,
                                                 int32_t* error_code,
                                                 const uint32_t* num_tables_ptr,
                                                 const int64_t* join_hash_tables) {
  for (uint32_t i = 0; i < *num_fragments; ++i) {
    query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
                                literals,
                                &num_rows[i * (*num_tables_ptr)],
                                &frag_row_offsets[i * (*num_tables_ptr)],
                                max_matched,
                                init_agg_value,
                                out,
                                i,
                                join_hash_tables,
                                error_code,
                                total_matched);
  }
}
extern "C" NEVER_INLINE void query_stub(const int8_t** col_buffers,
                                        const int64_t* num_rows,
                                        const uint64_t* frag_row_offsets,
                                        const int32_t* max_matched,
                                        const int64_t* init_agg_value,
                                        int64_t** out,
                                        uint32_t frag_idx,
                                        const int64_t* join_hash_tables,
                                        int32_t* error_code,
                                        int32_t* total_matched) {
  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
         out || frag_idx || error_code || join_hash_tables || total_matched);
}
extern "C" void multifrag_query(const int8_t*** col_buffers,
                                const uint64_t* num_fragments,
                                const int64_t* num_rows,
                                const uint64_t* frag_row_offsets,
                                const int32_t* max_matched,
                                int32_t* total_matched,
                                const int64_t* init_agg_value,
                                int64_t** out,
                                int32_t* error_code,
                                const uint32_t* num_tables_ptr,
                                const int64_t* join_hash_tables) {
  for (uint32_t i = 0; i < *num_fragments; ++i) {
    query_stub(col_buffers ? col_buffers[i] : nullptr,
               &num_rows[i * (*num_tables_ptr)],
               &frag_row_offsets[i * (*num_tables_ptr)],
               max_matched,
               init_agg_value,
               out,
               i,
               join_hash_tables,
               error_code,
               total_matched);
  }
}
// Fragments of check_interrupt_init (the branch bodies are elided in this
// listing):
  if (command == static_cast<unsigned>(INT_CHECK)) {
  if (command == static_cast<unsigned>(INT_ABORT)) {
  if (command == static_cast<unsigned>(INT_RESET)) {
__device__ void sync_warp_protected(int64_t thread_pos, int64_t row_count)
ALWAYS_INLINE void agg_sum_float(int32_t *agg, const float val)
ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t *agg, const int64_t val, const int64_t skip_val)
#define DEF_UMINUS_NULLABLE(type, null_type)
ALWAYS_INLINE void record_error_code(const int32_t err_code, int32_t *error_codes)
GPU_RT_STUB int32_t checked_single_agg_id_int32_shared(int32_t *agg, const int32_t val, const int32_t null_val)
__device__ void agg_count_distinct_bitmap_skip_val_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
#define DEF_CHECKED_SINGLE_AGG_ID_INT(n)
__device__ int64_t * get_matching_group_value_columnar(int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
ALWAYS_INLINE uint32_t agg_count_float(uint32_t *agg, const float val)
__device__ void write_back_nop(int64_t *dest, int64_t *src, const int32_t sz)
__device__ void agg_sum_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
void agg_min_int32(int32_t *agg, const int32_t val)
ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff, const int64_t pos)
#define DEF_CAST_NULLABLE_BIDIR(type1, type2)
ALWAYS_INLINE double load_avg_float(const int32_t *agg, const int32_t *count, const double null_val)
ALWAYS_INLINE void agg_max_float(int32_t *agg, const float val)
ALWAYS_INLINE int64_t * get_group_value_fast_keyless_semiprivate(int64_t *groups_buffer, const int64_t key, const int64_t min_key, const int64_t, const uint32_t row_size_quad, const uint8_t thread_warp_idx, const uint8_t warp_size)
ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t *agg, const int32_t val, const int32_t skip_val)
ALWAYS_INLINE uint64_t agg_count(uint64_t *agg, const int64_t)
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand, const int64_t scale, const int64_t null_val)
__device__ int8_t thread_warp_idx(const int8_t warp_sz)
__global__ void init_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size)
ALWAYS_INLINE double load_avg_double(const int64_t *agg, const int64_t *count, const double null_val)
#define DEF_CAST_NULLABLE(from_type, to_type)
ALWAYS_INLINE int32_t checked_single_agg_id_double(int64_t *agg, const double val, const double null_val)
__device__ int64_t get_thread_index()
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
__device__ int32_t pos_step_impl()
__device__ void write_back_non_grouped_agg(int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
ALWAYS_INLINE double load_double(const int64_t *agg)
__device__ int32_t checked_single_agg_id_double_shared(int64_t *agg, const double val, const double null_val)
__device__ const int64_t * init_shared_mem_nop(const int64_t *groups_buffer, const int32_t groups_buffer_size)
#define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym)
ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val)
ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand, const int64_t scale, const int64_t null_val)
#define DEF_AGG_MAX_INT(n)
void multifrag_query(const int8_t ***col_buffers, const uint64_t *num_fragments, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, int32_t *total_matched, const int64_t *init_agg_value, int64_t **out, int32_t *error_code, const uint32_t *num_tables_ptr, const int64_t *join_hash_tables)
ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len)
__device__ int32_t checked_single_agg_id_float_shared(int32_t *agg, const float val, const float null_val)
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
ALWAYS_INLINE int32_t checked_single_agg_id(int64_t *agg, const int64_t val, const int64_t null_val)
ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(int64_t *groups_buffer, const uint32_t hashed_index, const int64_t *key, const uint32_t key_count, const uint32_t entry_count)
void agg_max_int16(int16_t *agg, const int16_t val)
ALWAYS_INLINE int64_t floor_div_nullable_lhs(const int64_t dividend, const int64_t divisor, const int64_t null_val)
void agg_min_int8(int8_t *agg, const int8_t val)
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)
ALWAYS_INLINE int64_t * get_group_value_fast_keyless(int64_t *groups_buffer, const int64_t key, const int64_t min_key, const int64_t, const uint32_t row_size_quad)
__device__ void agg_id_double_shared_slow(int64_t *agg, const double *val)
ALWAYS_INLINE uint32_t agg_count_int32(uint32_t *agg, const int32_t)
ALWAYS_INLINE void agg_id_double(int64_t *agg, const double val)
ALWAYS_INLINE uint64_t string_pack(const int8_t *ptr, const int32_t len)
__device__ int64_t * declare_dynamic_shared_memory()
__device__ int32_t agg_sum_int32_shared(int32_t *agg, const int32_t val)
__device__ int64_t agg_sum_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void agg_sum_float_shared(int32_t *agg, const float val)
ALWAYS_INLINE void agg_sum_double(int64_t *agg, const double val)
NEVER_INLINE int8_t * extract_str_ptr_noinline(const uint64_t str_and_len)
ALWAYS_INLINE int8_t * extract_str_ptr(const uint64_t str_and_len)
#define DEF_SKIP_AGG(base_agg_func)
__device__ int64_t get_block_index()
__device__ bool check_interrupt()
#define DEF_WRITE_PROJECTION_INT(n)
ALWAYS_INLINE void agg_id_float(int32_t *agg, const float val)
ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t *agg, const float val, const float skip_val)
GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t *agg, const int8_t val, const int8_t null_val)
ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t *agg, const int32_t val, const int32_t skip_val)
__device__ int32_t agg_sum_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
ALWAYS_INLINE void agg_min_double(int64_t *agg, const double val)
ALWAYS_INLINE int32_t agg_sum_int32(int32_t *agg, const int32_t val)
__device__ void linear_probabilistic_count(uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
ALWAYS_INLINE DEVICE int32_t char_length(const char *str, const int32_t str_len)
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
#define DEF_SHARED_AGG_RET_STUBS(base_agg_func)
ALWAYS_INLINE int64_t * get_matching_group_value_perfect_hash_keyless(int64_t *groups_buffer, const uint32_t hashed_index, const uint32_t row_size_quad)
__device__ void agg_sum_double_shared(int64_t *agg, const double val)
void agg_min_int16(int16_t *agg, const int16_t val)
#define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym)
ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend, const int64_t divisor)
#define DEF_AGG_MIN_INT(n)
ALWAYS_INLINE void agg_max_double(int64_t *agg, const double val)
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
ALWAYS_INLINE uint64_t agg_count_double(uint64_t *agg, const double val)
void multifrag_query_hoisted_literals(const int8_t ***col_buffers, const uint64_t *num_fragments, const int8_t *literals, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, int32_t *total_matched, const int64_t *init_agg_value, int64_t **out, int32_t *error_code, const uint32_t *num_tables_ptr, const int64_t *join_hash_tables)
__device__ int32_t runtime_interrupt_flag
__device__ void agg_approximate_count_distinct_gpu(int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
__device__ void sync_warp()
ALWAYS_INLINE double load_avg_decimal(const int64_t *sum, const int64_t *count, const double null_val, const uint32_t scale)
NEVER_INLINE int32_t extract_str_len_noinline(const uint64_t str_and_len)
ALWAYS_INLINE int32_t checked_single_agg_id_float(int32_t *agg, const float val, const float null_val)
NEVER_INLINE void query_stub(const int8_t **col_buffers, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, int64_t **out, uint32_t frag_idx, const int64_t *join_hash_tables, int32_t *error_code, int32_t *total_matched)
ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand, const uint64_t scale, const int64_t operand_null_val, const int64_t result_null_val)
ALWAYS_INLINE int64_t agg_sum(int64_t *agg, const int64_t val)
__device__ void agg_sum_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
__device__ void agg_max_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
ALWAYS_INLINE void agg_min(int64_t *agg, const int64_t val)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint32_t MurmurHash1(const void *key, int len, const uint32_t seed)
ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale)
__device__ void agg_max_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
ALWAYS_INLINE int64_t * get_matching_group_value_perfect_hash(int64_t *groups_buffer, const uint32_t hashed_index, const int64_t *key, const uint32_t key_count, const uint32_t row_size_quad)
ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id)
NEVER_INLINE void init_group_by_buffer_impl(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_qw_count, const uint32_t agg_col_count, const bool keyless, const int8_t warp_size)
__device__ const int64_t * init_shared_mem(const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
void agg_max_int32(int32_t *agg, const int32_t val)
ALWAYS_INLINE int8_t logical_or(const int8_t lhs, const int8_t rhs, const int8_t null_val)
GPU_RT_STUB int32_t checked_single_agg_id_int16_shared(int16_t *agg, const int16_t val, const int16_t null_val)
#define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type)
ALWAYS_INLINE int32_t get_error_code(int32_t *error_codes)
bool check_interrupt_init(unsigned command)
ALWAYS_INLINE void agg_max(int64_t *agg, const int64_t val)
#define DEF_ARITH_NULLABLE(type, null_type, opname, opsym)
ALWAYS_INLINE float load_float(const int32_t *agg)
NEVER_INLINE void query_stub_hoisted_literals(const int8_t **col_buffers, const int8_t *literals, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, int64_t **out, uint32_t frag_idx, const int64_t *join_hash_tables, int32_t *error_code, int32_t *total_matched)
__device__ void agg_min_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
__device__ void sync_threadblock()
ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void agg_min_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
ALWAYS_INLINE double load_avg_int(const int64_t *sum, const int64_t *count, const double null_val)
ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char *str, const int32_t str_len, const int32_t int_null)
ALWAYS_INLINE DEVICE bool sample_ratio(const double proportion, const int64_t row_offset)
void agg_max_int8(int8_t *agg, const int8_t val)
#define DEF_SHARED_AGG_STUBS(base_agg_func)
ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset, const int64_t val, const int64_t min_val, const int64_t max_val, const int64_t null_val, const int8_t null_bool_val)
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
__device__ int32_t checked_single_agg_id_shared(int64_t *agg, const int64_t val, const int64_t null_val)
#define DEF_AGG_ID_INT(n)
NEVER_INLINE void agg_approximate_count_distinct(int64_t *agg, const int64_t key, const uint32_t b)
ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t *agg, const double val, const double skip_val)
ALWAYS_INLINE void agg_id(int64_t *agg, const int64_t val)
ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale)
ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t *agg, const int64_t val, const int64_t min_val)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val)
ALWAYS_INLINE int8_t logical_and(const int8_t lhs, const int8_t rhs, const int8_t null_val)
ALWAYS_INLINE void agg_min_float(int32_t *agg, const float val)
__device__ void force_sync()
ALWAYS_INLINE double percent_window_func(const int64_t output_buff, const int64_t pos)
NEVER_INLINE void init_columnar_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_qw_count, const uint32_t agg_col_count, const bool keyless, const bool blocks_share_memory, const int32_t frag_idx)
__device__ int32_t group_buff_idx_impl()