#ifdef __CUDACC__
#error This code is not intended to be compiled with a CUDA C++ compiler
#endif

#include "../Shared/funcannotations.h"
#define DEF_ARITH_NULLABLE(type, null_type, opname, opsym)        \
  extern "C" ALWAYS_INLINE type opname##_##type##_nullable(       \
      const type lhs, const type rhs, const null_type null_val) { \
    if (lhs != null_val && rhs != null_val) {                     \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym)    \
  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_lhs(   \
      const type lhs, const type rhs, const null_type null_val) { \
    if (lhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym)    \
  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_rhs(   \
      const type lhs, const type rhs, const null_type null_val) { \
    if (rhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_CMP_NULLABLE(type, null_type, opname, opsym)      \
  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable( \
      const type lhs,                                         \
      const type rhs,                                         \
      const null_type null_val,                               \
      const int8_t null_bool_val) {                           \
    if (lhs != null_val && rhs != null_val) {                 \
      return lhs opsym rhs;                                   \
    }                                                         \
    return null_bool_val;                                     \
  }
#define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym)      \
  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
      const type lhs,                                             \
      const type rhs,                                             \
      const null_type null_val,                                   \
      const int8_t null_bool_val) {                               \
    if (lhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_bool_val;                                         \
  }
#define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym)      \
  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
      const type lhs,                                             \
      const type rhs,                                             \
      const null_type null_val,                                   \
      const int8_t null_bool_val) {                               \
    if (rhs != null_val) {                                        \
      return lhs opsym rhs;                                       \
    }                                                             \
    return null_bool_val;                                         \
  }
#define DEF_SAFE_DIV_NULLABLE(type, null_type, opname)            \
  extern "C" ALWAYS_INLINE type safe_div_##type(                  \
      const type lhs, const type rhs, const null_type null_val) { \
    if (lhs != null_val && rhs != null_val && rhs != 0) {         \
      return lhs / rhs;                                           \
    }                                                             \
    return null_val;                                              \
  }
#define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
  DEF_ARITH_NULLABLE(type, null_type, add, +)        \
  DEF_ARITH_NULLABLE(type, null_type, sub, -)        \
  DEF_ARITH_NULLABLE(type, null_type, mul, *)        \
  DEF_ARITH_NULLABLE(type, null_type, div, /)        \
  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div)   \
  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +)    \
  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -)    \
  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *)    \
  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *)    \
  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /)    \
  DEF_CMP_NULLABLE(type, null_type, eq, ==)          \
  DEF_CMP_NULLABLE(type, null_type, ne, !=)          \
  DEF_CMP_NULLABLE(type, null_type, lt, <)           \
  DEF_CMP_NULLABLE(type, null_type, gt, >)           \
  DEF_CMP_NULLABLE(type, null_type, le, <=)          \
  DEF_CMP_NULLABLE(type, null_type, ge, >=)          \
  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==)      \
  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=)      \
  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <)       \
  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >)       \
  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=)      \
  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <)       \
  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >)       \
  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=)      \
  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
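
// A single instantiation such as DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t) emits the
// whole family of null-aware operators for that type, e.g.
//   add_int64_t_nullable(lhs, rhs, null_val)     -> lhs + rhs, or null_val if either side is null
//   add_int64_t_nullable_lhs(lhs, rhs, null_val) -> only the left operand is checked
//   eq_int64_t_nullable(lhs, rhs, null_val, null_bool_val)
// SQL NULL is represented by the dedicated sentinel null_val rather than a separate
// validity flag.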
#undef DEF_BINARY_NULLABLE_ALL_OPS
#undef DEF_SAFE_DIV_NULLABLE
#undef DEF_CMP_NULLABLE_RHS
#undef DEF_CMP_NULLABLE_LHS
#undef DEF_CMP_NULLABLE
#undef DEF_ARITH_NULLABLE_RHS
#undef DEF_ARITH_NULLABLE_LHS
#undef DEF_ARITH_NULLABLE
extern "C" ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand,
                                                  const uint64_t scale,
                                                  const int64_t operand_null_val,
                                                  const int64_t result_null_val) {
  return operand != operand_null_val ? operand * scale : result_null_val;
}
extern "C" ALWAYS_INLINE int64_t scale_decimal_down_nullable(
    const int64_t operand, const int64_t scale, const int64_t null_val) {
  if (operand == null_val) {
    return null_val;
  }
  int64_t tmp = scale >> 1;
  tmp = operand >= 0 ? operand + tmp : operand - tmp;
  return tmp / scale;
}

extern "C" ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(
    const int64_t operand, const int64_t scale, const int64_t null_val) {
  int64_t tmp = scale >> 1;
  tmp = operand >= 0 ? operand + tmp : operand - tmp;
  return tmp / scale;
}
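
// Adding half of `scale` with the sign of the operand before the integer division makes
// the downscale round half away from zero instead of truncating: with scale = 100,
// 12345 (123.45) becomes (12345 + 50) / 100 = 123 and -12355 (-123.55) becomes
// (-12355 - 50) / 100 = -124.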
#define DEF_UMINUS_NULLABLE(type, null_type)                                  \
  extern "C" ALWAYS_INLINE type uminus_##type##_nullable(const type operand,  \
                                                         const null_type null_val) { \
    return operand == null_val ? null_val : -operand;                         \
  }
#undef DEF_UMINUS_NULLABLE
#define DEF_CAST_NULLABLE(from_type, to_type)                                  \
  extern "C" ALWAYS_INLINE to_type cast_##from_type##_to_##to_type##_nullable( \
      const from_type operand,                                                 \
      const from_type from_null_val,                                           \
      const to_type to_null_val) {                                             \
    return operand == from_null_val ? to_null_val : operand;                   \
  }
#define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
  DEF_CAST_NULLABLE(type1, type2)             \
  DEF_CAST_NULLABLE(type2, type1)
#undef DEF_CAST_NULLABLE_BIDIR
#undef DEF_CAST_NULLABLE
extern "C" ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val) {
  return operand == null_val ? operand : (operand ? 0 : 1);
}

extern "C" ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
                                            const int8_t rhs,
                                            const int8_t null_val) {
  if (lhs == null_val) {
    return rhs == 0 ? rhs : null_val;
  }
  if (rhs == null_val) {
    return lhs == 0 ? lhs : null_val;
  }
  return (lhs && rhs) ? 1 : 0;
}
extern "C" ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
                                           const int8_t rhs,
                                           const int8_t null_val) {
  if (lhs == null_val) {
    return rhs == 0 ? null_val : rhs;
  }
  if (rhs == null_val) {
    return lhs == 0 ? null_val : lhs;
  }
  return (lhs || rhs) ? 1 : 0;
}
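
// logical_and / logical_or follow SQL three-valued logic with null_val standing in for
// NULL: FALSE AND NULL -> FALSE, TRUE AND NULL -> NULL, TRUE OR NULL -> TRUE,
// FALSE OR NULL -> NULL. A null operand only propagates when the other operand cannot
// decide the result by itself.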
extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t* agg,
                                                        const int64_t val,
                                                        const int64_t min_val) {
  const uint64_t bitmap_idx = val - min_val;
  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
}
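
// Exact COUNT(DISTINCT) path: *agg holds the address of a bitmap with one bit per value
// in the [min_val, max_val] domain; a value is recorded by setting bit (val - min_val),
// i.e. byte bitmap_idx >> 3, bit bitmap_idx & 7. bit_is_set() below reads the same layout.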
#define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
extern "C" NEVER_INLINE void agg_approximate_count_distinct(int64_t* agg,
                                                            const int64_t key,
                                                            const uint32_t b) {
  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
  const uint32_t index = hash >> (64 - b);
  const uint8_t rank = get_rank(hash << b, 64 - b);
  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
  M[index] = std::max(M[index], rank);
}
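
// HyperLogLog-style register update used by APPROX_COUNT_DISTINCT: the top b bits of the
// 64-bit hash select one of 2^b registers, get_rank() of the remaining bits gives the
// position of the first set bit, and each register keeps the maximum rank observed. The
// cardinality estimate itself is computed from the registers outside of this file.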
extern "C" ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
                                           const int64_t val,
                                           const int64_t min_val,
                                           const int64_t max_val,
                                           const int64_t null_val,
                                           const int8_t null_bool_val) {
  if (val == null_val) {
    return null_bool_val;
  }
  if (val < min_val || val > max_val) {
    return 0;
  }
  const uint64_t bitmap_idx = val - min_val;
  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
                 (1 << (bitmap_idx & 7))
             ? 1
             : 0;
}
const auto old = *agg;
*agg = std::max(*agg, val);
*agg = std::min(*agg, val);
const int64_t min_val,
const int64_t skip_val) {
if (val != skip_val) {
const auto old = *agg;
#define DEF_AGG_MAX_INT(n)                                                              \
  extern "C" ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, const int##n##_t val) { \
    *agg = std::max(*agg, val);                                                         \
  }

#undef DEF_AGG_MAX_INT
#define DEF_AGG_MIN_INT(n)                                                              \
  extern "C" ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, const int##n##_t val) { \
    *agg = std::min(*agg, val);                                                         \
  }

#undef DEF_AGG_MIN_INT
#define DEF_AGG_ID_INT(n)                                                              \
  extern "C" ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, const int##n##_t val) { \
    *agg = val;                                                                        \
  }

#undef DEF_AGG_ID_INT
#define DEF_WRITE_PROJECTION_INT(n)                                     \
  extern "C" ALWAYS_INLINE void write_projection_int##n(                \
      int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
    if (val != init_val) {                                              \
      *reinterpret_cast<int##n##_t*>(slot_ptr) = val;                   \
    }                                                                   \
  }

#undef DEF_WRITE_PROJECTION_INT
const int64_t skip_val) {
const auto old = *agg;
if (val != skip_val) {
if (old != skip_val) {
const int32_t skip_val) {
const auto old = *agg;
if (val != skip_val) {
if (old != skip_val) {
const int64_t skip_val) {
if (val != skip_val) {
const int32_t skip_val) {
if (val != skip_val) {
#define DEF_SKIP_AGG_ADD(base_agg_func)                       \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(     \
      DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
    if (val != skip_val) {                                    \
      base_agg_func(agg, val);                                \
    }                                                         \
  }

#define DEF_SKIP_AGG(base_agg_func)                           \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(     \
      DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
    if (val != skip_val) {                                    \
      const DATA_T old_agg = *agg;                            \
      if (old_agg != skip_val) {                              \
        base_agg_func(agg, val);                              \
      } else {                                                \
        *agg = val;                                           \
      }                                                       \
    }                                                         \
  }
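
// The *_skip_val wrappers make the basic aggregates null-aware: with DATA_T defined as
// int64_t, DEF_SKIP_AGG(agg_max) emits agg_max_skip_val(agg, val, skip_val), which ignores
// values equal to skip_val (the NULL sentinel) and lets the first non-null value overwrite
// a slot that still holds the sentinel, so the sentinel never participates in the
// comparison itself.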
#define DATA_T int64_t
#define DATA_T int32_t
#define DATA_T int16_t
#define DATA_T int8_t
#undef DEF_SKIP_AGG_ADD
extern "C" ALWAYS_INLINE void agg_sum_double(int64_t* agg, const double val) {
  const auto r = *reinterpret_cast<const double*>(agg) + val;
  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
}

extern "C" ALWAYS_INLINE void agg_max_double(int64_t* agg, const double val) {
  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_min_double(int64_t* agg, const double val) {
  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_id_double(int64_t* agg, const double val) {
  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
}

extern "C" ALWAYS_INLINE void agg_sum_float(int32_t* agg, const float val) {
  const auto r = *reinterpret_cast<const float*>(agg) + val;
  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
}

extern "C" ALWAYS_INLINE void agg_max_float(int32_t* agg, const float val) {
  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_min_float(int32_t* agg, const float val) {
  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
}

extern "C" ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
}
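
// Float and double aggregates reuse the integer output slots: each update reinterprets the
// slot bits as the floating-point type, combines them with val, and writes the result back
// as a raw int32_t/int64_t bit pattern (may_alias_ptr is used so the compiler tolerates the
// aliasing). load_double()/load_float() further down convert the bits back when results are
// read out.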
const double skip_val) {
if (val != skip_val) {
const float skip_val) {
if (val != skip_val) {
#define DEF_SKIP_AGG_ADD(base_agg_func)                       \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(     \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
    if (val != skip_val) {                                    \
      base_agg_func(agg, val);                                \
    }                                                         \
  }

#define DEF_SKIP_AGG(base_agg_func)                                                \
  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val(                          \
      ADDR_T* agg, const DATA_T val, const DATA_T skip_val) {                      \
    if (val != skip_val) {                                                         \
      const ADDR_T old_agg = *agg;                                                 \
      if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
        base_agg_func(agg, val);                                                   \
      } else {                                                                     \
        *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val));              \
      }                                                                            \
    }                                                                              \
  }
#define DATA_T double
#define ADDR_T int64_t
#define ADDR_T int32_t
#undef DEF_SKIP_AGG_ADD
return x / scale * scale;
return x / scale * scale - scale;
#define DEF_SHARED_AGG_RET_STUBS(base_agg_func)                                     \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg,             \
                                                         const int64_t val) {       \
    return 0;                                                                       \
  }                                                                                 \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared(                  \
      uint64_t* agg, const int64_t val, const int64_t skip_val) {                   \
    return 0;                                                                       \
  }                                                                                 \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg,       \
                                                               const int32_t val) { \
    return 0;                                                                       \
  }                                                                                 \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared(            \
      uint32_t* agg, const int32_t val, const int32_t skip_val) {                   \
    return 0;                                                                       \
  }                                                                                 \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg,      \
                                                                const double val) { \
    return 0;                                                                       \
  }                                                                                 \
  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared(           \
      uint64_t* agg, const double val, const double skip_val) {                     \
    return 0;                                                                       \
  }                                                                                 \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg,       \
                                                               const float val) {   \
    return 0;                                                                       \
  }                                                                                 \
  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared(            \
      uint32_t* agg, const float val, const float skip_val) {                       \
    return 0;                                                                       \
  }
#define DEF_SHARED_AGG_STUBS(base_agg_func)                                              \
  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared(                           \
      int64_t* agg, const int64_t val, const int64_t skip_val) {}                        \
  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg,                 \
                                                           const int32_t val) {}         \
  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg,                 \
                                                           const int16_t val) {}         \
  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg,                   \
                                                          const int8_t val) {}           \
  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared(                     \
      int32_t* agg, const int32_t val, const int32_t skip_val) {}                        \
  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg,                \
                                                            const double val) {}         \
  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared(                    \
      int64_t* agg, const double val, const double skip_val) {}                          \
  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg,                 \
                                                           const float val) {}           \
  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared(                     \
      int32_t* agg, const float val, const float skip_val) {}
const int16_t skip_val) {}
const int8_t skip_val) {}
const int16_t skip_val) {}
const int8_t skip_val) {}
const int64_t skip_val) {
const int32_t skip_val) {
const double skip_val) {}
const float skip_val) {}
int32_t row_index_resume{0};
row_index_resume = error_code[0];
return row_index_resume;
int32_t* error_codes) {
const int32_t groups_buffer_size) {
const int32_t groups_buffer_size) {
const int32_t groups_buffer_size) {
const int8_t warp_size) {
extern "C" __attribute__((noinline)) void init_columnar_group_by_buffer_gpu(
extern "C" __attribute__((noinline)) void init_group_by_buffer_impl(
const int8_t warp_size) {
template <typename T>
ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
                                                const uint32_t h,
                                                const T* key,
                                                const uint32_t key_count,
                                                const uint32_t row_size_quad) {
  auto off = h * row_size_quad;
  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
  if (*row_ptr == get_empty_key<T>()) {
    memcpy(row_ptr, key, key_count * sizeof(T));
    auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
    return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
  }
  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
    auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
    return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
  }
  return nullptr;
}
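
// Probes one slot of the row-wise hash table: an empty slot (get_empty_key sentinel) is
// claimed by copying the key in, a slot holding the same key is reused, and in both cases
// the returned pointer is the 64-bit-aligned aggregate payload that follows the key
// columns. nullptr signals a collision, in which case the caller probes another bucket.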
const uint32_t key_count,
const uint32_t key_width,
const uint32_t row_size_quad,
reinterpret_cast<const int32_t*>(key),
template <typename T>
ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(
    int64_t* groups_buffer, const uint32_t entry_count, const uint32_t h,
    const T* key, const uint32_t key_count) {
  auto off = h;
  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
  if (key_buffer[off] == get_empty_key<T>()) {
    for (size_t i = 0; i < key_count; ++i) {
      key_buffer[off] = key[i];
      off += entry_count;
    }
    return h;
  }
  off = h;
  for (size_t i = 0; i < key_count; ++i) {
    if (key_buffer[off] != key[i]) {
      return -1;
    }
    off += entry_count;
  }
  return h;
}
const uint32_t entry_count,
const uint32_t key_count,
const uint32_t key_width) {
reinterpret_cast<const int32_t*>(key),
groups_buffer, entry_count, h, key, key_count);
const size_t entry_count) {
groups_buffer[off] = key[i];
return &groups_buffer[off];
if (groups_buffer[off] != key[i]) {
return &groups_buffer[off];
extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
    int64_t* groups_buffer,
    const uint32_t hashed_index,
    const int64_t* key,
    const uint32_t key_count,
    const uint32_t row_size_quad) {
  uint32_t off = hashed_index * row_size_quad;
  if (groups_buffer[off] == EMPTY_KEY_64) {
    for (uint32_t i = 0; i < key_count; ++i) {
      groups_buffer[off + i] = key[i];
    }
  }
  return groups_buffer + off + key_count;
}
extern "C" ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(
    int64_t* groups_buffer,
    const uint32_t hashed_index,
    const int64_t* key,
    const uint32_t key_count,
    const uint32_t entry_count) {
  for (uint32_t i = 0; i < key_count; i++) {
    groups_buffer[i * entry_count + hashed_index] = key[i];
  }
}
extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
    int64_t* groups_buffer, const int64_t key, const int64_t min_key,
    const int64_t, const uint32_t row_size_quad) {
  return groups_buffer + row_size_quad * (key - min_key);
}

extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
    int64_t* groups_buffer, const int64_t key, const int64_t min_key,
    const int64_t, const uint32_t row_size_quad,
    const uint8_t thread_warp_idx, const uint8_t warp_size) {
  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
}
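
// "Fast keyless" perfect-hash entry points: every key in the contiguous range starting at
// min_key owns exactly one row, so the slot address is row_size_quad * (key - min_key)
// with no probing at all. The semiprivate variant keeps warp_size copies of each row and
// selects one by thread_warp_idx, so the lanes of a warp can aggregate without contending
// on the same slot; the copies are reduced later.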
extern "C" ALWAYS_INLINE int8_t* extract_str_ptr(const uint64_t str_and_len) {
  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
}

extern "C" ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len) {
  return static_cast<int64_t>(str_and_len) >> 48;
}

extern "C" NEVER_INLINE int8_t* extract_str_ptr_noinline(const uint64_t str_and_len) {
  return extract_str_ptr(str_and_len);
}

extern "C" NEVER_INLINE int32_t extract_str_len_noinline(const uint64_t str_and_len) {
  return extract_str_len(str_and_len);
}

extern "C" ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr, const int32_t len) {
  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
         (static_cast<const uint64_t>(len) << 48);
}
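
// A string value is packed into one 64-bit payload so it fits a single slot: the low
// 48 bits hold the character pointer and the high 16 bits the length. For example,
// string_pack(ptr, 5) produces (uint64_t(5) << 48) | uint64_t(ptr), which
// extract_str_ptr() and extract_str_len() take apart again; the arithmetic shift in
// extract_str_len() assumes lengths small enough to keep the top bit clear.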
#include "../Utils/StringLike.cpp"
const int32_t str_len) {
const int32_t str_len,
const int32_t int_null) {
extern "C" ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff,
                                                        const int64_t pos) {
  return reinterpret_cast<const int64_t*>(output_buff)[pos];
}

extern "C" ALWAYS_INLINE double percent_window_func(const int64_t output_buff,
                                                    const int64_t pos) {
  return reinterpret_cast<const double*>(output_buff)[pos];
}
extern "C" ALWAYS_INLINE double load_double(const int64_t* agg) {
  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
}

extern "C" ALWAYS_INLINE float load_float(const int32_t* agg) {
  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
}
extern "C" ALWAYS_INLINE double load_avg_int(const int64_t* sum,
                                             const int64_t* count,
                                             const double null_val) {
  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
}

extern "C" ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
                                                 const int64_t* count,
                                                 const double null_val,
                                                 const uint32_t scale) {
  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
}

extern "C" ALWAYS_INLINE double load_avg_double(const int64_t* agg,
                                                const int64_t* count,
                                                const double null_val) {
  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
                     : null_val;
}

extern "C" ALWAYS_INLINE double load_avg_float(const int32_t* agg,
                                               const int32_t* count,
                                               const double null_val) {
  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
                     : null_val;
}
extern "C" ALWAYS_INLINE void linear_probabilistic_count(uint8_t* bitmap,
                                                         const uint32_t bitmap_bytes,
                                                         const uint8_t* key_bytes,
                                                         const uint32_t key_len) {
  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
  const uint32_t word_idx = bit_pos / 32;
  const uint32_t bit_idx = bit_pos % 32;
  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
}
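
// Linear (probabilistic) counting: every key hashes to a single bit of a fixed-size
// bitmap, and the distinct-value count is later estimated from the fraction of bits left
// unset. With a single hash function this is essentially a one-level Bloom filter used
// only for cardinality estimation.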
extern "C" __attribute__((noinline)) void query_stub_hoisted_literals(
    const int8_t** col_buffers,
    int32_t* total_matched) {
  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
         init_agg_value || out || frag_idx || error_code || join_hash_tables ||
         total_matched);
}
const uint64_t* num_fragments,
int32_t* total_matched,
const uint32_t* num_tables_ptr,
for (uint32_t i = 0; i < *num_fragments; ++i) {
  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
                              &num_rows[i * (*num_tables_ptr)],
                              &frag_row_offsets[i * (*num_tables_ptr)],
extern "C" __attribute__((noinline)) void query_stub(const int8_t** col_buffers,
                                                     int32_t* total_matched) {
  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
         out || frag_idx || error_code || join_hash_tables || total_matched);
}
const uint64_t* num_fragments,
int32_t* total_matched,
const uint32_t* num_tables_ptr,
for (uint32_t i = 0; i < *num_fragments; ++i) {
  query_stub(col_buffers ? col_buffers[i] : nullptr,
             &num_rows[i * (*num_tables_ptr)],
             &frag_row_offsets[i * (*num_tables_ptr)],