18 #error This code is not intended to be compiled with a CUDA C++ compiler
45 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
46 extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable( \
47 const type lhs, const type rhs, const null_type null_val) { \
48 if (lhs != null_val && rhs != null_val) { \
49 return lhs opsym rhs; \
54 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
55 extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
56 const type lhs, const type rhs, const null_type null_val) { \
57 if (lhs != null_val) { \
58 return lhs opsym rhs; \
63 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
64 extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
65 const type lhs, const type rhs, const null_type null_val) { \
66 if (rhs != null_val) { \
67 return lhs opsym rhs; \
72 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
73 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable( \
76 const null_type null_val, \
77 const int8_t null_bool_val) { \
78 if (lhs != null_val && rhs != null_val) { \
79 return lhs opsym rhs; \
81 return null_bool_val; \
84 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
85 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
88 const null_type null_val, \
89 const int8_t null_bool_val) { \
90 if (lhs != null_val) { \
91 return lhs opsym rhs; \
93 return null_bool_val; \
96 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
97 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
100 const null_type null_val, \
101 const int8_t null_bool_val) { \
102 if (rhs != null_val) { \
103 return lhs opsym rhs; \
105 return null_bool_val; \
108 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
109 extern "C" RUNTIME_EXPORT ALWAYS_INLINE type safe_div_##type( \
110 const type lhs, const type rhs, const null_type null_val) { \
111 if (lhs != null_val && rhs != null_val && rhs != 0) { \
// Instantiates the complete set of null-aware binary operators for one
// (type, null_type) pair. For each arithmetic op (+, -, *, /) and each
// comparison (==, !=, <, >, <=, >=) three null-propagation flavors are
// emitted: both operands nullable (DEF_*_NULLABLE), only the left operand
// nullable (*_LHS), and only the right operand nullable (*_RHS). A safe_div
// variant is also generated, which additionally guards the divisor.
// Each DEF_* helper expands to one extern "C" scalar runtime function whose
// name encodes the operation, the value type, and the nullability flavor,
// e.g. add_<type>_nullable_lhs, so the JIT can resolve them by name.
117 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
118 DEF_ARITH_NULLABLE(type, null_type, add, +) \
119 DEF_ARITH_NULLABLE(type, null_type, sub, -) \
120 DEF_ARITH_NULLABLE(type, null_type, mul, *) \
121 DEF_ARITH_NULLABLE(type, null_type, div, /) \
122 DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
123 DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
124 DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
125 DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
126 DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
127 DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
128 DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
129 DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
130 DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
131 DEF_CMP_NULLABLE(type, null_type, eq, ==) \
132 DEF_CMP_NULLABLE(type, null_type, ne, !=) \
133 DEF_CMP_NULLABLE(type, null_type, lt, <) \
134 DEF_CMP_NULLABLE(type, null_type, gt, >) \
135 DEF_CMP_NULLABLE(type, null_type, le, <=) \
136 DEF_CMP_NULLABLE(type, null_type, ge, >=) \
137 DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
138 DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
139 DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
140 DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
141 DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
142 DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
143 DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
144 DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
145 DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
146 DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
147 DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
148 DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
// Tear down the operator-generator macros now that every instantiation has
// been emitted, so these helper names do not leak into the rest of the
// translation unit. The #undef order is the reverse of definition order.
169 #undef DEF_BINARY_NULLABLE_ALL_OPS
170 #undef DEF_SAFE_DIV_NULLABLE
171 #undef DEF_CMP_NULLABLE_RHS
172 #undef DEF_CMP_NULLABLE_LHS
173 #undef DEF_CMP_NULLABLE
174 #undef DEF_ARITH_NULLABLE_RHS
175 #undef DEF_ARITH_NULLABLE_LHS
176 #undef DEF_ARITH_NULLABLE
178 #define DEF_MAP_STRING_TO_DATUM(value_type, value_name) \
179 extern "C" ALWAYS_INLINE DEVICE value_type map_string_to_datum_##value_name( \
180 const int32_t string_id, \
181 const int64_t translation_map_handle, \
182 const int32_t min_source_id) { \
183 const Datum* translation_map = \
184 reinterpret_cast<const Datum*>(translation_map_handle); \
185 const Datum& out_datum = translation_map[string_id - min_source_id]; \
186 return out_datum.value_name##val; \
197 #undef DEF_MAP_STRING_TO_DATUM
201 const uint64_t scale,
202 const int64_t operand_null_val,
203 const int64_t result_null_val) {
204 return operand != operand_null_val ? operand * scale : result_null_val;
210 const int64_t null_val) {
212 if (operand == null_val) {
216 int64_t tmp = scale >> 1;
217 tmp = operand >= 0 ? operand + tmp : operand - tmp;
224 const int64_t null_val) {
225 int64_t tmp = scale >> 1;
226 tmp = operand >= 0 ? operand + tmp : operand - tmp;
233 const int64_t divisor) {
234 return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
241 const int64_t divisor,
242 const int64_t null_val) {
243 return dividend == null_val ? null_val :
floor_div_lhs(dividend, divisor);
246 #define DEF_UMINUS_NULLABLE(type, null_type) \
247 extern "C" RUNTIME_EXPORT ALWAYS_INLINE type uminus_##type##_nullable( \
248 const type operand, const null_type null_val) { \
249 return operand == null_val ? null_val : -operand; \
259 #undef DEF_UMINUS_NULLABLE
261 #define DEF_CAST_NULLABLE(from_type, to_type) \
262 extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
263 cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
264 const from_type from_null_val, \
265 const to_type to_null_val) { \
266 return operand == from_null_val ? to_null_val : operand; \
269 #define DEF_CAST_SCALED_NULLABLE(from_type, to_type) \
270 extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
271 cast_##from_type##_to_##to_type##_scaled_nullable(const from_type operand, \
272 const from_type from_null_val, \
273 const to_type to_null_val, \
274 const to_type divider) { \
275 return operand == from_null_val ? to_null_val : operand / divider; \
// Emits nullable cast helpers in both directions between type1 and type2:
// cast_<type1>_to_<type2>_nullable and cast_<type2>_to_<type1>_nullable.
278 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
279 DEF_CAST_NULLABLE(type1, type2) \
280 DEF_CAST_NULLABLE(type2, type1)
282 #define DEF_ROUND_NULLABLE(from_type, to_type) \
283 extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
284 cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
285 const from_type from_null_val, \
286 const to_type to_null_val) { \
287 return operand == from_null_val \
289 : static_cast<to_type>(operand + (operand < from_type(0) \
291 : from_type(0.5))); \
// Scope cleanup for the cast/round generator macros defined above.
325 #undef DEF_ROUND_NULLABLE
326 #undef DEF_CAST_NULLABLE_BIDIR
327 #undef DEF_CAST_SCALED_NULLABLE
328 #undef DEF_CAST_NULLABLE
331 const int8_t null_val) {
332 return operand == null_val ? operand : (operand ? 0 : 1);
337 const int8_t null_val) {
338 if (lhs == null_val) {
339 return rhs == 0 ? rhs : null_val;
341 if (rhs == null_val) {
342 return lhs == 0 ? lhs : null_val;
344 return (lhs && rhs) ? 1 : 0;
349 const int8_t null_val) {
350 if (lhs == null_val) {
351 return rhs == 0 ? null_val : rhs;
353 if (rhs == null_val) {
354 return lhs == 0 ? null_val : lhs;
356 return (lhs || rhs) ? 1 : 0;
368 const int64_t min_val,
369 const int64_t bucket_size) {
370 uint64_t bitmap_idx = val - min_val;
371 if (1 < bucket_size) {
372 bitmap_idx /=
static_cast<uint64_t
>(bucket_size);
374 reinterpret_cast<int8_t*
>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
378 #define GPU_RT_STUB NEVER_INLINE
380 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
395 const uint32_t index = hash >> (64 - b);
396 const uint8_t rank =
get_rank(hash << b, 64 - b);
397 uint8_t* M =
reinterpret_cast<uint8_t*
>(*agg);
398 M[index] = std::max(M[index], rank);
409 const int64_t min_val,
410 const int64_t max_val,
411 const int64_t null_val,
412 const int8_t null_bool_val) {
413 if (val == null_val) {
414 return null_bool_val;
416 if (val < min_val || val > max_val) {
422 const uint64_t bitmap_idx = val - min_val;
423 return bitset[bitmap_idx >> 3] & (1 << (bitmap_idx & 7)) ? 1 : 0;
428 const int64_t target_value,
429 const int64_t* col_buf) {
431 int64_t h = entry_cnt - 1;
433 int64_t mid = l + (h - l) / 2;
434 if (target_value < col_buf[mid]) {
445 return null_start_pos == 0 ? null_end_pos + 1 : 0;
450 const int64_t null_start_pos,
451 const int64_t null_end_pos) {
452 return null_end_pos == num_elems ? null_start_pos : num_elems;
455 template <
typename T,
typename Comparator>
457 const int64_t cur_row_idx,
459 const int32_t* partition_rowid_buf,
460 const int64_t* ordered_index_buf,
462 const bool nulls_first,
463 const int64_t null_start_pos,
464 const int64_t null_end_pos,
466 const auto target_value = col_buf[cur_row_idx];
467 if (target_value == null_val) {
468 for (int64_t target_offset = null_start_pos; target_offset < null_end_pos;
470 const auto candidate_offset = partition_rowid_buf[ordered_index_buf[target_offset]];
471 if (candidate_offset == cur_row_idx) {
472 return target_offset;
476 auto const modified_null_end_pos = nulls_first ? null_end_pos - 1 : null_end_pos;
480 int64_t mid = l + (h - l) / 2;
481 auto const target_row_idx = partition_rowid_buf[ordered_index_buf[mid]];
482 auto const cur_value = col_buf[target_row_idx];
483 if (cmp(target_value, cur_value)) {
489 int64_t target_offset = l;
490 int64_t candidate_row_idx = partition_rowid_buf[ordered_index_buf[target_offset]];
491 while (col_buf[candidate_row_idx] == target_value && target_offset < num_elems) {
492 if (candidate_row_idx == cur_row_idx) {
493 return target_offset;
495 candidate_row_idx = partition_rowid_buf[ordered_index_buf[++target_offset]];
500 #define DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(value_type, oper_name) \
501 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
502 compute_##value_type##_##oper_name##_current_row_idx_in_frame( \
503 const int64_t num_elems, \
504 const int64_t cur_row_idx, \
505 const value_type* col_buf, \
506 const int32_t* partition_rowid_buf, \
507 const int64_t* ordered_index_buf, \
508 const value_type null_val, \
509 const bool nulls_first, \
510 const int64_t null_start_pos, \
511 const int64_t null_end_pos) { \
512 return compute_current_row_idx_in_frame<value_type>(num_elems, \
515 partition_rowid_buf, \
521 std::oper_name<value_type>{}); \
// Instantiates compute_<value_type>_<oper_name>_current_row_idx_in_frame for
// every supported window-function value type: 8/16/32/64-bit integers plus
// float and double.
523 #define DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME_ALL_TYPES(oper_name) \
524 DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int8_t, oper_name) \
525 DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int16_t, oper_name) \
526 DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int32_t, oper_name) \
527 DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(int64_t, oper_name) \
528 DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(float, oper_name) \
529 DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME(double, oper_name)
// Scope cleanup for the row-index-in-frame generator macros.
534 #undef DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME_ALL_TYPES
535 #undef DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME
537 template <
typename TARGET_VAL_TYPE,
typename COL_TYPE,
typename NULL_TYPE>
539 const int64_t num_elems,
540 const TARGET_VAL_TYPE target_val,
541 const COL_TYPE* col_buf,
542 const int32_t* partition_rowid_buf,
543 const int64_t* ordered_index_buf,
544 const NULL_TYPE null_val,
545 const bool nulls_first,
546 const int64_t null_start_offset,
547 const int64_t null_end_offset) {
548 if (target_val == null_val) {
549 return null_start_offset;
551 auto const modified_null_end_pos = nulls_first ? null_end_offset - 1 : null_end_offset;
555 int64_t mid = l + (h - l) / 2;
556 if (target_val <= col_buf[partition_rowid_buf[ordered_index_buf[mid]]]) {
565 #define DEF_RANGE_MODE_FRAME_LOWER_BOUND( \
566 target_val_type, col_type, null_type, opname, opsym) \
567 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
568 range_mode_##target_val_type##_##col_type##_##null_type##_##opname##_frame_lower_bound( \
569 const int64_t num_elems, \
570 const target_val_type target_value, \
571 const col_type* col_buf, \
572 const int32_t* partition_rowid_buf, \
573 const int64_t* ordered_index_buf, \
574 const int64_t frame_bound_val, \
575 const null_type null_val, \
576 const bool nulls_first, \
577 const int64_t null_start_pos, \
578 const int64_t null_end_pos) { \
579 if (target_value == null_val) { \
580 return null_start_pos; \
582 target_val_type new_val = target_value opsym frame_bound_val; \
583 return compute_lower_bound_from_ordered_partition_index<target_val_type, \
589 partition_rowid_buf, \
616 #undef DEF_RANGE_MODE_FRAME_LOWER_BOUND
618 template <
typename TARGET_VAL_TYPE,
typename COL_TYPE,
typename NULL_TYPE>
620 const int64_t num_elems,
621 const TARGET_VAL_TYPE target_val,
622 const COL_TYPE* col_buf,
623 const int32_t* partition_rowid_buf,
624 const int64_t* ordered_index_buf,
625 const NULL_TYPE null_val,
626 const bool nulls_first,
627 const int64_t null_start_offset,
628 const int64_t null_end_offset) {
629 if (target_val == null_val) {
630 return null_end_offset;
632 auto const modified_null_end_pos = nulls_first ? null_end_offset - 1 : null_end_offset;
636 int64_t mid = l + (h - l) / 2;
637 if (target_val >= col_buf[partition_rowid_buf[ordered_index_buf[mid]]]) {
646 #define DEF_RANGE_MODE_FRAME_UPPER_BOUND( \
647 target_val_type, col_type, null_type, opname, opsym) \
648 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
649 range_mode_##target_val_type##_##col_type##_##null_type##_##opname##_frame_upper_bound( \
650 const int64_t num_elems, \
651 const target_val_type target_value, \
652 const col_type* col_buf, \
653 const int32_t* partition_rowid_buf, \
654 const int64_t* ordered_index_buf, \
655 const int64_t frame_bound_val, \
656 const null_type null_val, \
657 const bool nulls_first, \
658 const int64_t null_start_pos, \
659 const int64_t null_end_pos) { \
660 if (target_value == null_val) { \
661 return null_end_pos; \
663 target_val_type new_val = target_value opsym frame_bound_val; \
664 return compute_upper_bound_from_ordered_partition_index<target_val_type, \
670 partition_rowid_buf, \
697 #undef DEF_RANGE_MODE_FRAME_UPPER_BOUND
699 template <
typename COL_TYPE,
typename LOGICAL_TYPE>
701 const int64_t frame_start_offset,
702 const int64_t frame_end_offset,
703 const COL_TYPE* col_buf,
704 const int32_t* partition_rowid_buf,
705 const int64_t* ordered_index_buf,
706 const LOGICAL_TYPE logical_null_val,
707 const LOGICAL_TYPE col_null_val) {
708 if (target_row_idx_in_frame < frame_start_offset ||
709 target_row_idx_in_frame > frame_end_offset) {
710 return logical_null_val;
712 const auto target_offset =
713 partition_rowid_buf[ordered_index_buf[target_row_idx_in_frame]];
714 LOGICAL_TYPE target_val = col_buf[target_offset];
715 if (target_val == col_null_val) {
716 return logical_null_val;
721 #define DEF_GET_VALUE_IN_FRAME(col_type, logical_type) \
722 extern "C" RUNTIME_EXPORT ALWAYS_INLINE logical_type \
723 get_##col_type##_value_##logical_type##_type_in_frame( \
724 const int64_t target_row_idx_in_frame, \
725 const int64_t frame_start_offset, \
726 const int64_t frame_end_offset, \
727 const col_type* col_buf, \
728 const int32_t* partition_rowid_buf, \
729 const int64_t* ordered_index_buf, \
730 const logical_type logical_null_val, \
731 const logical_type col_null_val) { \
732 return get_value_in_window_frame<col_type, logical_type>(target_row_idx_in_frame, \
733 frame_start_offset, \
736 partition_rowid_buf, \
753 #undef DEF_GET_VALUE_IN_FRAME
757 int64_t multiplier) {
758 return decoded_val == null_val ? decoded_val : decoded_val * multiplier;
763 int64_t current_partition_start_offset,
764 int64_t frame_bound) {
765 int64_t index = candidate_index - current_partition_start_offset - frame_bound;
766 return index < 0 ? 0 : index;
771 int64_t current_partition_start_offset,
773 int64_t num_current_partition_elem) {
774 int64_t index = candidate_index - current_partition_start_offset + frame_bound;
775 return index >= num_current_partition_elem ? num_current_partition_elem : index;
780 int64_t current_partition_start_offset,
781 int64_t frame_bound) {
782 int64_t index = candidate_index - current_partition_start_offset - frame_bound;
783 return index < 0 ? 0 : index + 1;
788 int64_t current_partition_start_offset,
790 int64_t num_current_partition_elem) {
791 int64_t index = candidate_index - current_partition_start_offset + frame_bound;
792 return index >= num_current_partition_elem ? num_current_partition_elem : index + 1;
796 int64_t** aggregation_trees,
797 size_t partition_idx) {
798 return aggregation_trees[partition_idx];
802 int64_t** aggregation_trees,
803 size_t partition_idx) {
804 double** casted_aggregation_trees =
reinterpret_cast<double**
>(aggregation_trees);
805 return casted_aggregation_trees[partition_idx];
812 return casted_aggregation_trees[partition_idx];
819 return casted_aggregation_trees[partition_idx];
825 for (
size_t i = 0; i < level; i++) {
826 offset += pow(tree_fanout, i);
833 template <AggFuncType AGG_FUNC_TYPE,
typename AGG_TYPE>
834 inline AGG_TYPE
agg_func(AGG_TYPE
const lhs, AGG_TYPE
const rhs) {
836 return std::min(lhs, rhs);
838 return std::max(lhs, rhs);
845 template <AggFuncType AGG_FUNC_TYPE,
typename AGG_TYPE>
847 AGG_TYPE* aggregation_tree_for_partition,
848 size_t query_range_start_idx,
849 size_t query_range_end_idx,
853 AGG_TYPE invalid_val,
856 size_t begin = leaf_start_idx + query_range_start_idx;
857 size_t end = leaf_start_idx + query_range_end_idx;
858 AGG_TYPE
res = init_val;
859 bool all_nulls =
true;
860 for (
int level = leaf_level; level >= 0; level--) {
861 size_t parentBegin = begin / tree_fanout;
862 size_t parentEnd = (end - 1) / tree_fanout;
863 if (parentBegin == parentEnd) {
864 for (
size_t pos = begin; pos < end; pos++) {
865 if (aggregation_tree_for_partition[pos] != null_val) {
867 res = agg_func<AGG_FUNC_TYPE>(
res, aggregation_tree_for_partition[pos]);
870 return all_nulls ? null_val :
res;
871 }
else if (parentBegin > parentEnd) {
874 size_t group_begin = (parentBegin * tree_fanout) + 1;
875 if (begin != group_begin) {
876 size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1;
877 for (
size_t pos = begin; pos < limit; pos++) {
878 if (aggregation_tree_for_partition[pos] != null_val) {
880 res = agg_func<AGG_FUNC_TYPE>(
res, aggregation_tree_for_partition[pos]);
885 size_t group_end = (parentEnd * tree_fanout) + 1;
886 if (end != group_end) {
887 for (
size_t pos = group_end; pos < end; pos++) {
888 if (aggregation_tree_for_partition[pos] != null_val) {
890 res = agg_func<AGG_FUNC_TYPE>(
res, aggregation_tree_for_partition[pos]);
900 #define DEF_SEARCH_AGGREGATION_TREE(agg_value_type) \
901 extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_value_type \
902 search_##agg_value_type##_aggregation_tree( \
903 agg_value_type* aggregated_tree_for_partition, \
904 size_t query_range_start_idx, \
905 size_t query_range_end_idx, \
907 size_t tree_fanout, \
910 agg_value_type invalid_val, \
911 agg_value_type null_val, \
912 int32_t agg_type) { \
913 if (!aggregated_tree_for_partition || query_range_start_idx > query_range_end_idx) { \
916 switch (agg_type) { \
918 return compute_window_func_via_aggregation_tree<AggFuncType::MIN>( \
919 aggregated_tree_for_partition, \
920 query_range_start_idx, \
921 query_range_end_idx, \
924 std::numeric_limits<agg_value_type>::max(), \
929 return compute_window_func_via_aggregation_tree<AggFuncType::MAX>( \
930 aggregated_tree_for_partition, \
931 query_range_start_idx, \
932 query_range_end_idx, \
935 std::numeric_limits<agg_value_type>::lowest(), \
940 return compute_window_func_via_aggregation_tree<AggFuncType::SUM>( \
941 aggregated_tree_for_partition, \
942 query_range_start_idx, \
943 query_range_end_idx, \
946 static_cast<agg_value_type>(0), \
955 #undef DEF_SEARCH_AGGREGATION_TREE
957 template <
typename AGG_VALUE_TYPE>
961 size_t query_range_start_idx,
962 size_t query_range_end_idx,
965 AGG_VALUE_TYPE invalid_val,
966 AGG_VALUE_TYPE null_val) {
968 size_t begin = leaf_start_idx + query_range_start_idx;
969 size_t end = leaf_start_idx + query_range_end_idx;
972 bool all_nulls =
true;
973 for (
int level = leaf_level; level >= 0; level--) {
974 size_t parentBegin = begin / tree_fanout;
975 size_t parentEnd = (end - 1) / tree_fanout;
976 if (parentBegin == parentEnd) {
977 for (
size_t pos = begin; pos < end; pos++) {
978 if (aggregation_tree_for_partition[pos].sum != null_val) {
980 res.
sum += aggregation_tree_for_partition[pos].
sum;
981 res.
count += aggregation_tree_for_partition[pos].
count;
988 }
else if (parentBegin > parentEnd) {
992 size_t group_begin = (parentBegin * tree_fanout) + 1;
993 if (begin != group_begin) {
994 size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1;
995 for (
size_t pos = begin; pos < limit; pos++) {
996 if (aggregation_tree_for_partition[pos].sum != null_val) {
998 res.
sum += aggregation_tree_for_partition[pos].
sum;
999 res.
count += aggregation_tree_for_partition[pos].
count;
1004 size_t group_end = (parentEnd * tree_fanout) + 1;
1005 if (end != group_end) {
1006 for (
size_t pos = group_end; pos < end; pos++) {
1007 if (aggregation_tree_for_partition[pos].sum != null_val) {
1009 res.
sum += aggregation_tree_for_partition[pos].
sum;
1010 res.
count += aggregation_tree_for_partition[pos].
count;
1014 begin = parentBegin;
1021 #define DEF_SEARCH_DERIVED_AGGREGATION_TREE(agg_value_type) \
1022 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double \
1023 search_##agg_value_type##_derived_aggregation_tree( \
1024 SumAndCountPair<agg_value_type>* aggregated_tree_for_partition, \
1025 size_t query_range_start_idx, \
1026 size_t query_range_end_idx, \
1027 size_t leaf_level, \
1028 size_t tree_fanout, \
1029 bool decimal_type, \
1031 agg_value_type invalid_val, \
1032 agg_value_type null_val, \
1033 int32_t agg_type) { \
1034 if (!aggregated_tree_for_partition || query_range_start_idx > query_range_end_idx) { \
1037 SumAndCountPair<agg_value_type> res{0, 0}; \
1038 compute_derived_aggregates<agg_value_type>(aggregated_tree_for_partition, \
1040 query_range_start_idx, \
1041 query_range_end_idx, \
1046 if (res.sum == null_val) { \
1048 } else if (res.count > 0) { \
1049 if (decimal_type) { \
1050 return (static_cast<double>(res.sum) / pow(10, scale)) / res.count; \
1052 return (static_cast<double>(res.sum)) / res.count; \
1054 return invalid_val; \
1060 #undef DEF_SEARCH_DERIVED_AGGREGATION_TREE
1062 #define DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG(agg_type, null_type) \
1063 extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_type \
1064 handle_null_val_##agg_type##_##null_type##_window_framing_agg( \
1065 agg_type res, null_type agg_null_val, agg_type input_col_null_val) { \
1066 if (res == agg_null_val) { \
1067 return input_col_null_val; \
1074 #undef DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG
1076 template <
typename T>
1080 int64_t
const num_elems_in_partition,
1081 int32_t*
const partition_rowid_buf,
1082 int64_t*
const ordered_index_buf,
1083 bool const is_forward_fill) {
1084 T const cur_val = col_buf[partition_rowid_buf[ordered_index_buf[cur_idx]]];
1085 if (cur_val == null_val) {
1086 if (is_forward_fill) {
1087 for (int64_t cand_idx = cur_idx - 1; cand_idx >= 0; --cand_idx) {
1088 T const candidate_val = col_buf[partition_rowid_buf[ordered_index_buf[cand_idx]]];
1089 if (candidate_val != null_val) {
1090 return candidate_val;
1094 for (int64_t cand_idx = cur_idx + 1; cand_idx < num_elems_in_partition;
1096 T const candidate_val = col_buf[partition_rowid_buf[ordered_index_buf[cand_idx]]];
1097 if (candidate_val != null_val) {
1098 return candidate_val;
1105 #define DEF_FILL_MISSING_VALUE(col_type) \
1106 extern "C" RUNTIME_EXPORT ALWAYS_INLINE col_type fill_##col_type##_missing_value( \
1107 int64_t const cur_row_idx_in_frame, \
1108 col_type const null_val, \
1109 col_type* const col_buf, \
1110 int64_t const num_elems_in_partition, \
1111 int32_t* const partition_rowid_buf, \
1112 int64_t* const ordered_index_buf, \
1113 bool const is_forward_fill) { \
1114 return fill_missing_value<col_type>(cur_row_idx_in_frame, \
1117 num_elems_in_partition, \
1118 partition_rowid_buf, \
1119 ordered_index_buf, \
1128 #undef DEF_FILL_MISSING_VALUE
1131 const auto old = *agg;
1138 const int8_t cond) {
1139 return cond ?
agg_sum(agg, val) : *agg;
1143 *agg = std::max(*agg, val);
1147 *agg = std::min(*agg, val);
1155 const int64_t offset,
1156 const int8_t* value,
1157 const int64_t size_bytes) {
1158 for (
auto i = 0; i < size_bytes; i++) {
1159 varlen_buffer[offset + i] = value[i];
1161 return &varlen_buffer[offset];
1166 if (val == null_val) {
1172 }
else if (*agg == null_val) {
1184 const int64_t min_val,
1185 const int64_t bucket_size,
1186 const int64_t skip_val) {
1187 if (val != skip_val) {
1208 const int32_t cond) {
1209 return cond ? (*agg)++ : *agg;
1213 const int32_t val) {
1214 const auto old = *agg;
1221 const int8_t cond) {
1225 #define DEF_AGG_MAX_INT(n) \
1226 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, \
1227 const int##n##_t val) { \
1228 *agg = std::max(*agg, val); \
1234 #undef DEF_AGG_MAX_INT
1236 #define DEF_AGG_MIN_INT(n) \
1237 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, \
1238 const int##n##_t val) { \
1239 *agg = std::min(*agg, val); \
1245 #undef DEF_AGG_MIN_INT
1247 #define DEF_AGG_ID_INT(n) \
1248 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, \
1249 const int##n##_t val) { \
1253 #define DEF_CHECKED_SINGLE_AGG_ID_INT(n) \
1254 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_int##n( \
1255 int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
1256 if (val == null_val) { \
1259 if (*agg == val) { \
1261 } else if (*agg == null_val) { \
// Scope cleanup for the integer agg_id / checked-single-agg_id generators.
1278 #undef DEF_AGG_ID_INT
1279 #undef DEF_CHECKED_SINGLE_AGG_ID_INT
1281 #define DEF_WRITE_PROJECTION_INT(n) \
1282 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void write_projection_int##n( \
1283 int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
1284 if (val != init_val) { \
1285 *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
1291 #undef DEF_WRITE_PROJECTION_INT
1295 const int64_t skip_val) {
1296 const auto old = *agg;
1297 if (val != skip_val) {
1298 if (old != skip_val) {
1309 const auto old = *agg;
1310 if (val != skip_val) {
1311 if (old != skip_val) {
1323 const int64_t skip_val,
1324 const int8_t cond) {
1331 const int32_t skip_val,
1332 const int8_t cond) {
1337 const int64_t cond) {
1338 return cond ? (*agg)++ : *agg;
1343 if (val != skip_val) {
1351 if (cond != skip_val) {
1359 if (val != skip_val) {
1367 if (cond != skip_val) {
1373 #define DEF_SKIP_AGG_ADD(base_agg_func) \
1374 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1375 DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
1376 if (val != skip_val) { \
1377 base_agg_func(agg, val); \
1381 #define DEF_SKIP_AGG(base_agg_func) \
1382 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1383 DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
1384 if (val != skip_val) { \
1385 const DATA_T old_agg = *agg; \
1386 if (old_agg != skip_val) { \
1387 base_agg_func(agg, val); \
1394 #define DATA_T int64_t
1399 #define DATA_T int32_t
1404 #define DATA_T int16_t
1409 #define DATA_T int8_t
1414 #undef DEF_SKIP_AGG_ADD
1426 const auto r = *
reinterpret_cast<const double*
>(agg) + val;
1427 *agg = *
reinterpret_cast<const int64_t*
>(may_alias_ptr(&r));
1432 const int8_t cond) {
1440 const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
1441 *agg = *(
reinterpret_cast<const int64_t*
>(may_alias_ptr(&r)));
1446 const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
1447 *agg = *(
reinterpret_cast<const int64_t*
>(may_alias_ptr(&r)));
1452 *agg = *(
reinterpret_cast<const int64_t*
>(may_alias_ptr(&val)));
1457 if (val == null_val) {
1461 if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
1463 }
else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
1464 *agg = *(
reinterpret_cast<const int64_t*
>(may_alias_ptr(&val)));
1479 const auto r = *
reinterpret_cast<const float*
>(agg) + val;
1480 *agg = *
reinterpret_cast<const int32_t*
>(may_alias_ptr(&r));
1485 const int8_t cond) {
1493 const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
1494 *agg = *(
reinterpret_cast<const int32_t*
>(may_alias_ptr(&r)));
1499 const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
1500 *agg = *(
reinterpret_cast<const int32_t*
>(may_alias_ptr(&r)));
1504 *agg = *(
reinterpret_cast<const int32_t*
>(may_alias_ptr(&val)));
1509 if (val == null_val) {
1513 if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
1515 }
else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
1516 *agg = *(
reinterpret_cast<const int32_t*
>(may_alias_ptr(&val)));
1526 if (val != skip_val) {
1534 if (val != skip_val) {
1540 #define DEF_SKIP_AGG(base_agg_func) \
1541 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1542 ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
1543 if (val != skip_val) { \
1544 const ADDR_T old_agg = *agg; \
1545 if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
1546 base_agg_func(agg, val); \
1548 *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
1553 #define DEF_SKIP_IF_AGG(skip_agg_func, base_agg_func) \
1554 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void skip_agg_func##_skip_val( \
1555 ADDR_T* agg, const DATA_T val, const DATA_T skip_val, const int8_t cond) { \
1557 base_agg_func##_skip_val(agg, val, skip_val); \
1561 #define DATA_T double
1562 #define ADDR_T int64_t
1570 #define DATA_T float
1571 #define ADDR_T int32_t
1580 #undef DEF_SKIP_IF_AGG
1583 const int64_t scale) {
1585 return x / scale * scale;
1590 return x / scale * scale - scale;
1594 const int64_t scale) {
1600 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
1601 extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
1602 const int64_t val) { \
1606 extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
1607 uint64_t* agg, const int64_t val, const int64_t skip_val) { \
1610 extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
1611 const int32_t val) { \
1615 extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
1616 uint32_t* agg, const int32_t val, const int32_t skip_val) { \
1620 extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
1621 const double val) { \
1625 extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
1626 uint64_t* agg, const double val, const double skip_val) { \
1629 extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
1630 const float val) { \
1634 extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
1635 uint32_t* agg, const float val, const float skip_val) { \
1639 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
1640 extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
1642 extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
1643 int64_t* agg, const int64_t val, const int64_t skip_val) {} \
1644 extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
1645 const int32_t val) {} \
1646 extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
1647 const int16_t val) {} \
1648 extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
1649 const int8_t val) {} \
1651 extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
1652 int32_t* agg, const int32_t val, const int32_t skip_val) {} \
1654 extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
1655 const double val) {} \
1657 extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
1658 int64_t* agg, const double val, const double skip_val) {} \
1659 extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
1660 const float val) {} \
1662 extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
1663 int32_t* agg, const float val, const float skip_val) {}
1672 const int64_t offset,
1673 const int8_t* value,
1674 const int64_t size_bytes) {
1680 const int64_t null_val) {
1687 const int32_t null_val) {
1693 const int16_t null_val) {
1698 const int8_t null_val) {
1705 const double null_val) {
1711 const float null_val) {
1717 const int16_t skip_val) {}
1721 const int8_t skip_val) {}
1725 const int16_t skip_val) {}
1729 const int8_t skip_val) {}
1739 const int8_t cond) {
1745 const int64_t skip_val) {
1751 const int64_t skip_val,
1752 const int8_t cond) {
1761 const int32_t skip_val) {
1769 const double skip_val) {}
1774 const float skip_val) {}
1778 const int8_t cond) {
1784 const int32_t skip_val,
1785 const int8_t cond) {
1791 const int8_t cond) {}
1795 const double skip_val,
1796 const int8_t cond) {}
1799 const int8_t cond) {}
1803 const float skip_val,
1804 const int8_t cond) {}
1813 int64_t* output_buffer,
1814 const int32_t num_agg_cols){};
1819 return row_index_resume ? *row_index_resume : 0;
1849 int32_t* error_codes) {
1868 const int64_t* groups_buffer,
1869 const int32_t groups_buffer_size) {
1870 return groups_buffer;
1883 const int32_t groups_buffer_size) {
1888 int64_t* groups_buffer,
1889 const int64_t* init_vals,
1890 const uint32_t groups_buffer_entry_count,
1891 const uint32_t key_qw_count,
1892 const uint32_t agg_col_count,
1894 const int8_t warp_size) {
1897 assert(groups_buffer);
1902 int64_t* groups_buffer,
1903 const int64_t* init_vals,
1904 const uint32_t groups_buffer_entry_count,
1905 const uint32_t key_qw_count,
1906 const uint32_t agg_col_count,
1908 const bool blocks_share_memory,
1909 const int32_t frag_idx) {
1912 assert(groups_buffer);
1917 int64_t* groups_buffer,
1918 const int64_t* init_vals,
1919 const uint32_t groups_buffer_entry_count,
1920 const uint32_t key_qw_count,
1921 const uint32_t agg_col_count,
1923 const int8_t warp_size) {
1926 assert(groups_buffer);
1930 template <
typename T>
1934 const uint32_t key_count,
1935 const uint32_t row_size_quad) {
1936 auto off = h * row_size_quad;
1937 auto row_ptr =
reinterpret_cast<T*
>(groups_buffer + off);
1938 if (*row_ptr == get_empty_key<T>()) {
1939 memcpy(row_ptr, key, key_count *
sizeof(
T));
1940 auto row_ptr_i8 =
reinterpret_cast<int8_t*
>(row_ptr + key_count);
1943 if (memcmp(row_ptr, key, key_count *
sizeof(
T)) == 0) {
1944 auto row_ptr_i8 =
reinterpret_cast<int8_t*
>(row_ptr + key_count);
1951 int64_t* groups_buffer,
1954 const uint32_t key_count,
1955 const uint32_t key_width,
1956 const uint32_t row_size_quad) {
1957 switch (key_width) {
1961 reinterpret_cast<const int32_t*>(key),
1971 template <
typename T>
1973 const uint32_t entry_count,
1976 const uint32_t key_count) {
1978 auto key_buffer =
reinterpret_cast<T*
>(groups_buffer);
1979 if (key_buffer[off] == get_empty_key<T>()) {
1980 for (
size_t i = 0; i < key_count; ++i) {
1981 key_buffer[off] = key[i];
1987 for (
size_t i = 0; i < key_count; ++i) {
1988 if (key_buffer[off] != key[i]) {
1998 const uint32_t entry_count,
2001 const uint32_t key_count,
2002 const uint32_t key_width) {
2003 switch (key_width) {
2008 reinterpret_cast<const int32_t*>(key),
2012 groups_buffer, entry_count, h, key, key_count);
2020 int64_t* groups_buffer,
2023 const uint32_t key_qw_count,
2024 const size_t entry_count) {
2027 for (
size_t i = 0; i < key_qw_count; ++i) {
2028 groups_buffer[off] = key[i];
2031 return &groups_buffer[off];
2034 for (
size_t i = 0; i < key_qw_count; ++i) {
2035 if (groups_buffer[off] != key[i]) {
2040 return &groups_buffer[off];
2055 int64_t* groups_buffer,
2056 const uint32_t hashed_index,
2058 const uint32_t key_count,
2059 const uint32_t row_size_quad) {
2060 uint32_t off = hashed_index * row_size_quad;
2062 for (uint32_t i = 0; i < key_count; ++i) {
2063 groups_buffer[off + i] = key[i];
2066 return groups_buffer + off + key_count;
2077 const uint32_t hashed_index,
2078 const uint32_t row_size_quad) {
2079 return groups_buffer + row_size_quad * hashed_index;
2088 const uint32_t hashed_index,
2090 const uint32_t key_count,
2091 const uint32_t entry_count) {
2093 for (uint32_t i = 0; i < key_count; i++) {
2094 groups_buffer[i * entry_count + hashed_index] = key[i];
2104 int64_t* groups_buffer,
2106 const int64_t min_key,
2108 const uint32_t row_size_quad) {
2109 return groups_buffer + row_size_quad * (key - min_key);
2113 int64_t* groups_buffer,
2115 const int64_t min_key,
2117 const uint32_t row_size_quad,
2119 const uint8_t warp_size) {
2120 return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
2124 const int32_t len) {
2125 return {
reinterpret_cast<char const*
>(ptr), static_cast<uint64_t>(len)};
2129 #include "../Utils/StringLike.cpp"
2156 const int64_t translation_map_handle,
2157 const int32_t min_source_id) {
2158 const int32_t* translation_map =
2159 reinterpret_cast<const int32_t*
>(translation_map_handle);
2160 return translation_map[string_id - min_source_id];
2164 const double* regressor_inputs,
2165 const int64_t decision_tree_table_handle,
2166 const int64_t decision_tree_offsets_handle,
2167 const int32_t num_regressors,
2168 const int32_t num_trees,
2169 const bool compute_avg,
2170 const double null_value) {
2171 for (int32_t regressor_idx = 0; regressor_idx < num_regressors; ++regressor_idx) {
2172 if (regressor_inputs[regressor_idx] == null_value) {
2178 const int64_t* decision_tree_offsets =
2179 reinterpret_cast<const int64_t*
>(decision_tree_offsets_handle);
2180 double sum_tree_results{0};
2181 for (int32_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
2182 int64_t row_idx = decision_tree_offsets[tree_idx];
2186 sum_tree_results += current_entry.
value;
2189 const auto regressor_input = regressor_inputs[current_entry.
feature_index];
2190 row_idx = regressor_input <= current_entry.
value
2195 return compute_avg ? sum_tree_results / num_trees : sum_tree_results;
2199 const double proportion,
2200 const int64_t row_offset) {
2201 const int64_t threshold = 4294967296 * proportion;
2202 return (row_offset * 2654435761) % 4294967296 < threshold;
2209 const double scale_factor,
2210 const int32_t partition_count) {
2211 if (target_value < lower_bound) {
2213 }
else if (target_value >= upper_bound) {
2214 return partition_count + 1;
2216 return ((target_value - lower_bound) * scale_factor) + 1;
2223 const double scale_factor,
2224 const int32_t partition_count) {
2225 if (target_value > lower_bound) {
2227 }
else if (target_value <= upper_bound) {
2228 return partition_count + 1;
2230 return ((lower_bound - target_value) * scale_factor) + 1;
2237 const double scale_factor,
2238 const int32_t partition_count,
2239 const double null_val) {
2240 if (target_value == null_val) {
2244 target_value, lower_bound, upper_bound, scale_factor, partition_count);
2251 const double scale_factor,
2252 const int32_t partition_count,
2253 const double null_val) {
2254 if (target_value == null_val) {
2258 target_value, lower_bound, upper_bound, scale_factor, partition_count);
2267 const double scale_factor) {
2268 int32_t calc = (target_value -
lower_bound) * scale_factor;
2275 const double scale_factor) {
2276 int32_t calc = (lower_bound - target_value) * scale_factor;
2282 const bool reversed,
2285 const int32_t partition_count) {
2290 partition_count / (lower_bound - upper_bound),
2296 partition_count / (upper_bound - lower_bound),
2302 const bool reversed,
2305 const int32_t partition_count,
2306 const double null_val) {
2307 if (target_value == null_val) {
2311 target_value, reversed, lower_bound, upper_bound, partition_count);
2316 const bool reversed,
2319 const int32_t partition_count) {
2322 target_value, lower_bound, partition_count / (lower_bound - upper_bound));
2325 target_value, lower_bound, partition_count / (upper_bound - lower_bound));
2330 return reinterpret_cast<const int64_t*
>(output_buff)[pos];
2334 const int64_t output_buff,
2335 const int64_t pos) {
2336 return reinterpret_cast<const double*
>(output_buff)[pos];
2340 return *
reinterpret_cast<const double*
>(may_alias_ptr(agg));
2344 return *
reinterpret_cast<const float*
>(may_alias_ptr(agg));
2348 const int64_t* count,
2349 const double null_val) {
2350 return *count != 0 ?
static_cast<double>(*sum) / *count : null_val;
2354 const int64_t* count,
2355 const double null_val,
2356 const uint32_t scale) {
2357 return *count != 0 ? (
static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
2361 const int64_t* count,
2362 const double null_val) {
2363 return *count != 0 ? *
reinterpret_cast<const double*
>(may_alias_ptr(agg)) / *count
2368 const int32_t* count,
2369 const double null_val) {
2370 return *count != 0 ? *
reinterpret_cast<const float*
>(may_alias_ptr(agg)) / *count
2376 const uint32_t bitmap_bytes,
2377 const uint8_t* key_bytes,
2378 const uint32_t key_len) {
2379 const uint32_t bit_pos =
MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
2380 const uint32_t word_idx = bit_pos / 32;
2381 const uint32_t bit_idx = bit_pos % 32;
2382 reinterpret_cast<uint32_t*
>(bitmap)[word_idx] |= 1 << bit_idx;
2388 int32_t* total_matched,
2390 const uint32_t frag_idx,
2391 const uint32_t* row_index_resume,
2392 const int8_t** col_buffers,
2393 const int8_t* literals,
2394 const int64_t* num_rows,
2395 const uint64_t* frag_row_offsets,
2396 const int32_t* max_matched,
2397 const int64_t* init_agg_value,
2398 const int64_t* join_hash_tables,
2399 const int8_t* row_func_mgr) {
2401 assert(error_code || total_matched || out || frag_idx || row_index_resume ||
2402 col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
2403 init_agg_value || join_hash_tables || row_func_mgr);
2410 int32_t* total_matched,
2412 const uint32_t* num_fragments_ptr,
2413 const uint32_t* num_tables_ptr,
2414 const uint32_t* row_index_resume,
2415 const int8_t*** col_buffers,
2416 const int8_t* literals,
2417 const int64_t* num_rows,
2418 const uint64_t* frag_row_offsets,
2419 const int32_t* max_matched,
2420 const int64_t* init_agg_value,
2421 const int64_t* join_hash_tables,
2422 const int8_t* row_func_mgr) {
2423 uint32_t
const num_fragments = *num_fragments_ptr;
2424 uint32_t
const num_tables = *num_tables_ptr;
2426 for (uint32_t frag_idx = 0; frag_idx < num_fragments; ++frag_idx) {
2432 col_buffers ? col_buffers[frag_idx] :
nullptr,
2434 &num_rows[frag_idx * num_tables],
2435 &frag_row_offsets[frag_idx * num_tables],
2445 int32_t* total_matched,
2447 const uint32_t frag_idx,
2448 const uint32_t* row_index_resume,
2449 const int8_t** col_buffers,
2450 const int64_t* num_rows,
2451 const uint64_t* frag_row_offsets,
2452 const int32_t* max_matched,
2453 const int64_t* init_agg_value,
2454 const int64_t* join_hash_tables,
2455 const int8_t* row_func_mgr) {
2457 assert(error_code || total_matched || out || frag_idx || row_index_resume ||
2458 col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
2459 join_hash_tables || row_func_mgr);
2465 int32_t* total_matched,
2467 const uint32_t* num_fragments_ptr,
2468 const uint32_t* num_tables_ptr,
2469 const uint32_t* row_index_resume,
2470 const int8_t*** col_buffers,
2471 const int64_t* num_rows,
2472 const uint64_t* frag_row_offsets,
2473 const int32_t* max_matched,
2474 const int64_t* init_agg_value,
2475 const int64_t* join_hash_tables,
2476 const int8_t* row_func_mgr) {
2477 uint32_t
const num_fragments = *num_fragments_ptr;
2478 uint32_t
const num_tables = *num_tables_ptr;
2480 for (uint32_t frag_idx = 0; frag_idx < num_fragments; ++frag_idx) {
2486 col_buffers ? col_buffers[frag_idx] :
nullptr,
2487 &num_rows[frag_idx * num_tables],
2488 &frag_row_offsets[frag_idx * num_tables],
2506 if (command == static_cast<unsigned>(
INT_CHECK)) {
2512 if (command == static_cast<unsigned>(
INT_ABORT)) {
2516 if (command == static_cast<unsigned>(
INT_RESET)) {
DEVICE auto upper_bound(ARGS &&...args)
__device__ void sync_warp_protected(int64_t thread_pos, int64_t row_count)
RUNTIME_EXPORT void agg_min_int8(int8_t *agg, const int8_t val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t encode_date(int64_t decoded_val, int64_t null_val, int64_t multiplier)
RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_if_int32_skip_val(int32_t *agg, const int32_t val, const int32_t skip_val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE int64_t compute_row_mode_start_index_sub(int64_t candidate_index, int64_t current_partition_start_offset, int64_t frame_bound)
#define DEF_UMINUS_NULLABLE(type, null_type)
GPU_RT_STUB int32_t checked_single_agg_id_int32_shared(int32_t *agg, const int32_t val, const int32_t null_val)
#define DEF_CHECKED_SINGLE_AGG_ID_INT(n)
RUNTIME_EXPORT ALWAYS_INLINE int8_t * agg_id_varlen(int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
RUNTIME_EXPORT void agg_max_int32(int32_t *agg, const int32_t val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand, const int64_t scale, const int64_t null_val)
__device__ int64_t * get_matching_group_value_columnar(int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_matching_group_value_perfect_hash(int64_t *groups_buffer, const uint32_t hashed_index, const int64_t *key, const uint32_t key_count, const uint32_t row_size_quad)
int64_t left_child_row_idx
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_if(uint64_t *agg, const int64_t cond)
__device__ void write_back_nop(int64_t *dest, int64_t *src, const int32_t sz)
RUNTIME_EXPORT void agg_min_int16(int16_t *agg, const int16_t val)
RUNTIME_EXPORT ALWAYS_INLINE StringView string_pack(const int8_t *ptr, const int32_t len)
__device__ void agg_sum_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t floor_div_nullable_lhs(const int64_t dividend, const int64_t divisor, const int64_t null_val)
#define DEF_COMPUTE_CURRENT_ROW_IDX_IN_FRAME_ALL_TYPES(oper_name)
RUNTIME_EXPORT void agg_max_int16(int16_t *agg, const int16_t val)
#define DEF_CAST_NULLABLE_BIDIR(type1, type2)
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_group_value_fast_keyless(int64_t *groups_buffer, const int64_t key, const int64_t min_key, const int64_t, const uint32_t row_size_quad)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t *agg, const int32_t val, const int32_t skip_val)
RUNTIME_EXPORT NEVER_INLINE void agg_approximate_count_distinct(int64_t *agg, const int64_t key, const uint32_t b)
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
__device__ int8_t thread_warp_idx(const int8_t warp_sz)
__global__ void init_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size)
#define DEF_CAST_NULLABLE(from_type, to_type)
RUNTIME_EXPORT ALWAYS_INLINE void agg_max_double(int64_t *agg, const double val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_if(int64_t *agg, const int64_t val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE void agg_max(int64_t *agg, const int64_t val)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t *agg, const int64_t val, const int64_t skip_val)
__device__ int64_t get_thread_index()
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
__device__ void agg_sum_if_double_skip_val_shared(int64_t *agg, const double val, const double skip_val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE void agg_min_float(int32_t *agg, const float val)
__device__ int32_t pos_step_impl()
__device__ void write_back_non_grouped_agg(int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
Calculate approximate median and general quantiles, based on "Computing Extremely Accurate Quantiles ...
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_int(const int64_t *sum, const int64_t *count, const double null_val)
Structures and runtime functions of streaming top-k heap.
__device__ int32_t checked_single_agg_id_double_shared(int64_t *agg, const double val, const double null_val)
__device__ const int64_t * init_shared_mem_nop(const int64_t *groups_buffer, const int32_t groups_buffer_size)
RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id(int64_t *agg, const int64_t val, const int64_t null_val)
__device__ void agg_sum_if_float_shared(int32_t *agg, const float val, const int8_t cond)
#define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym)
#define DEF_AGG_MAX_INT(n)
Definitions for core Datum union type.
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_integer_aggregation_tree(int64_t **aggregation_trees, size_t partition_idx)
__device__ int32_t checked_single_agg_id_float_shared(int32_t *agg, const float val, const float null_val)
int64_t compute_upper_bound_from_ordered_partition_index(const int64_t num_elems, const TARGET_VAL_TYPE target_val, const COL_TYPE *col_buf, const int32_t *partition_rowid_buf, const int64_t *ordered_index_buf, const NULL_TYPE null_val, const bool nulls_first, const int64_t null_start_offset, const int64_t null_end_offset)
AGG_TYPE agg_func(AGG_TYPE const lhs, AGG_TYPE const rhs)
Macros and functions for groupby buffer compaction.
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_and(const int8_t lhs, const int8_t rhs, const int8_t null_val)
#define DEF_CAST_SCALED_NULLABLE(from_type, to_type)
RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size)
T fill_missing_value(int64_t const cur_idx, T const null_val, T *const col_buf, int64_t const num_elems_in_partition, int32_t *const partition_rowid_buf, int64_t *const ordered_index_buf, bool const is_forward_fill)
__device__ void agg_sum_if_double_shared(int64_t *agg, const double val, const int8_t cond)
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)
RUNTIME_EXPORT void agg_sum_if_float(int32_t *agg, const float val, const int8_t cond)
__device__ void agg_id_double_shared_slow(int64_t *agg, const double *val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend, const int64_t divisor)
__device__ int32_t agg_sum_if_int32_shared(int32_t *agg, const int32_t val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id)
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_matching_group_value_perfect_hash_keyless(int64_t *groups_buffer, const uint32_t hashed_index, const uint32_t row_size_quad)
__device__ int8_t * agg_id_varlen_shared(int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_if_int32_skip_val(uint32_t *agg, const int32_t cond, const int32_t skip_val)
int64_t compute_current_row_idx_in_frame(const int64_t num_elems, const int64_t cur_row_idx, const T *col_buf, const int32_t *partition_rowid_buf, const int64_t *ordered_index_buf, const T null_val, const bool nulls_first, const int64_t null_start_pos, const int64_t null_end_pos, Comparator cmp)
__device__ int64_t * declare_dynamic_shared_memory()
RUNTIME_EXPORT ALWAYS_INLINE int64_t compute_row_mode_end_index_add(int64_t candidate_index, int64_t current_partition_start_offset, int64_t frame_bound, int64_t num_current_partition_elem)
ALWAYS_INLINE DEVICE int32_t map_string_dict_id(const int32_t string_id, const int64_t translation_map_handle, const int32_t min_source_id)
__device__ int32_t agg_sum_int32_shared(int32_t *agg, const int32_t val)
__device__ int64_t agg_sum_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void agg_sum_float_shared(int32_t *agg, const float val)
__device__ int64_t agg_sum_if_shared(int64_t *agg, const int64_t val, const int8_t cond)
#define DEF_ROUND_NULLABLE(from_type, to_type)
ALWAYS_INLINE DEVICE double tree_model_reg_predict(const double *regressor_inputs, const int64_t decision_tree_table_handle, const int64_t decision_tree_offsets_handle, const int32_t num_regressors, const int32_t num_trees, const bool compute_avg, const double null_value)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t width_bucket(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count)
#define DEF_SKIP_AGG(base_agg_func)
__device__ int64_t get_block_index()
__device__ bool check_interrupt()
#define DEF_WRITE_PROJECTION_INT(n)
GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t *agg, const int8_t val, const int8_t null_val)
RUNTIME_EXPORT NEVER_INLINE void query_stub(int32_t *error_code, int32_t *total_matched, int64_t **out, const uint32_t frag_idx, const uint32_t *row_index_resume, const int8_t **col_buffers, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, const int64_t *join_hash_tables, const int8_t *row_func_mgr)
RUNTIME_EXPORT ALWAYS_INLINE int64_t compute_row_mode_start_index_add(int64_t candidate_index, int64_t current_partition_start_offset, int64_t frame_bound, int64_t num_current_partition_elem)
__device__ int32_t agg_sum_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale)
#define DEF_SEARCH_DERIVED_AGGREGATION_TREE(agg_value_type)
__device__ void linear_probabilistic_count(uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double(uint64_t *agg, const double val)
#define DEF_SHARED_AGG_RET_STUBS(base_agg_func)
__device__ void agg_sum_double_shared(int64_t *agg, const double val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_min_double(int64_t *agg, const double val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t width_bucket_no_oob_check(const double target_value, const double lower_bound, const double scale_factor)
RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale)
#define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym)
__device__ int64_t agg_sum_if_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE void agg_if_sum_float(int32_t *agg, const float val, const int8_t cond)
#define DEF_MAP_STRING_TO_DATUM(value_type, value_name)
RUNTIME_EXPORT ALWAYS_INLINE int64_t get_valid_buf_start_pos(const int64_t null_start_pos, const int64_t null_end_pos)
#define DEF_AGG_MIN_INT(n)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t *agg, const double val, const double skip_val)
RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals(int32_t *error_code, int32_t *total_matched, int64_t **out, const uint32_t frag_idx, const uint32_t *row_index_resume, const int8_t **col_buffers, const int8_t *literals, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, const int64_t *join_hash_tables, const int8_t *row_func_mgr)
RUNTIME_EXPORT void multifrag_query(int32_t *error_code, int32_t *total_matched, int64_t **out, const uint32_t *num_fragments_ptr, const uint32_t *num_tables_ptr, const uint32_t *row_index_resume, const int8_t ***col_buffers, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, const int64_t *join_hash_tables, const int8_t *row_func_mgr)
RUNTIME_EXPORT ALWAYS_INLINE void agg_min(int64_t *agg, const int64_t val)
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
RUNTIME_EXPORT ALWAYS_INLINE int32_t width_bucket_nullable(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count, const double null_val)
RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_id_float(int32_t *agg, const float val)
RUNTIME_EXPORT ALWAYS_INLINE float load_float(const int32_t *agg)
__device__ int32_t runtime_interrupt_flag
RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_double(int64_t *agg, const double val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff, const int64_t pos)
RUNTIME_EXPORT NEVER_INLINE void init_columnar_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_qw_count, const uint32_t agg_col_count, const bool keyless, const bool blocks_share_memory, const int32_t frag_idx)
std::function< bool(const PermutationIdx, const PermutationIdx)> Comparator
__device__ void agg_approximate_count_distinct_gpu(int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
__device__ void sync_warp()
RUNTIME_EXPORT ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand, const int64_t scale, const int64_t null_val)
__device__ void agg_count_distinct_bitmap_skip_val_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_if_skip_val(int64_t *agg, const int64_t val, const int64_t skip_val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE void agg_id(int64_t *agg, const int64_t val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand, const uint64_t scale, const int64_t operand_null_val, const int64_t result_null_val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t char_length(const char *str, const int32_t str_len)
RUNTIME_EXPORT ALWAYS_INLINE double * get_double_aggregation_tree(int64_t **aggregation_trees, size_t partition_idx)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t width_bucket_reversed(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count)
RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_int32(int32_t *agg, const int32_t val)
RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_float(int32_t *agg, const float val, const float null_val)
__device__ void agg_sum_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
int64_t right_child_row_idx
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_if_skip_val(uint64_t *agg, const int64_t cond, const int64_t skip_val)
LOGICAL_TYPE get_value_in_window_frame(const int64_t target_row_idx_in_frame, const int64_t frame_start_offset, const int64_t frame_end_offset, const COL_TYPE *col_buf, const int32_t *partition_rowid_buf, const int64_t *ordered_index_buf, const LOGICAL_TYPE logical_null_val, const LOGICAL_TYPE col_null_val)
__device__ void agg_max_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t get_valid_buf_end_pos(const int64_t num_elems, const int64_t null_start_pos, const int64_t null_end_pos)
RUNTIME_EXPORT ALWAYS_INLINE double load_double(const int64_t *agg)
RUNTIME_EXPORT ALWAYS_INLINE void agg_id_double(int64_t *agg, const double val)
#define DEF_FILL_MISSING_VALUE(col_type)
RUNTIME_EXPORT ALWAYS_INLINE int8_t bit_is_set(const int8_t *bitset, const int64_t val, const int64_t min_val, const int64_t max_val, const int64_t null_val, const int8_t null_bool_val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t width_bucket_expr_no_oob_check(const double target_value, const bool reversed, const double lower_bound, const double upper_bound, const int32_t partition_count)
DEVICE auto lower_bound(ARGS &&...args)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_if_int32(uint32_t *agg, const int32_t cond)
__device__ void agg_max_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
#define DEF_SEARCH_AGGREGATION_TREE(agg_value_type)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint32_t MurmurHash3(const void *key, int len, const uint32_t seed)
void compute_derived_aggregates(SumAndCountPair< AGG_VALUE_TYPE > *aggregation_tree_for_partition, SumAndCountPair< AGG_VALUE_TYPE > &res, size_t query_range_start_idx, size_t query_range_end_idx, size_t leaf_level, size_t tree_fanout, AGG_VALUE_TYPE invalid_val, AGG_VALUE_TYPE null_val)
#define DEF_RANGE_MODE_FRAME_UPPER_BOUND(target_val_type, col_type, null_type, opname, opsym)
RUNTIME_EXPORT void agg_max_int8(int8_t *agg, const int8_t val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_group_value_fast_keyless_semiprivate(int64_t *groups_buffer, const int64_t key, const int64_t min_key, const int64_t, const uint32_t row_size_quad, const uint8_t thread_warp_idx, const uint8_t warp_size)
RUNTIME_EXPORT ALWAYS_INLINE size_t getStartOffsetForSegmentTreeTraversal(size_t level, size_t tree_fanout)
RUNTIME_EXPORT ALWAYS_INLINE void agg_max_float(int32_t *agg, const float val)
__device__ const int64_t * init_shared_mem(const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_double(int64_t *agg, const double val, const double null_val)
GPU_RT_STUB int32_t checked_single_agg_id_int16_shared(int16_t *agg, const int16_t val, const int16_t null_val)
RUNTIME_EXPORT void multifrag_query_hoisted_literals(int32_t *error_code, int32_t *total_matched, int64_t **out, const uint32_t *num_fragments_ptr, const uint32_t *num_tables_ptr, const uint32_t *row_index_resume, const int8_t ***col_buffers, const int8_t *literals, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, const int64_t *join_hash_tables, const int8_t *row_func_mgr)
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_float(const int32_t *agg, const int32_t *count, const double null_val)
#define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type)
RUNTIME_EXPORT ALWAYS_INLINE SumAndCountPair< double > * get_double_derived_aggregation_tree(int64_t **aggregation_trees, size_t partition_idx)
RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_impl(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_qw_count, const uint32_t agg_col_count, const bool keyless, const int8_t warp_size)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32(uint32_t *agg, const int32_t)
RUNTIME_EXPORT ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(int64_t *groups_buffer, const uint32_t hashed_index, const int64_t *key, const uint32_t key_count, const uint32_t entry_count)
#define DEF_GET_VALUE_IN_FRAME(col_type, logical_type)
RUNTIME_EXPORT ALWAYS_INLINE int64_t compute_row_mode_end_index_sub(int64_t candidate_index, int64_t current_partition_start_offset, int64_t frame_bound)
AGG_TYPE compute_window_func_via_aggregation_tree(AGG_TYPE *aggregation_tree_for_partition, size_t query_range_start_idx, size_t query_range_end_idx, size_t leaf_level, size_t tree_fanout, AGG_TYPE init_val, AGG_TYPE invalid_val, AGG_TYPE null_val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t width_bucket_expr_nullable(const double target_value, const bool reversed, const double lower_bound, const double upper_bound, const int32_t partition_count, const double null_val)
#define DEF_ARITH_NULLABLE(type, null_type, opname, opsym)
RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_if_int32(int32_t *agg, const int32_t val, const int8_t cond)
__device__ void agg_min_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int32_t width_bucket_reversed_nullable(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count, const double null_val)
int64_t compute_lower_bound_from_ordered_partition_index(const int64_t num_elems, const TARGET_VAL_TYPE target_val, const COL_TYPE *col_buf, const int32_t *partition_rowid_buf, const int64_t *ordered_index_buf, const NULL_TYPE null_val, const bool nulls_first, const int64_t null_start_offset, const int64_t null_end_offset)
RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void sync_threadblock()
RUNTIME_EXPORT ALWAYS_INLINE int64_t compute_int64_t_lower_bound(const int64_t entry_cnt, const int64_t target_value, const int64_t *col_buf)
__device__ void agg_min_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t *agg, const int32_t val, const int32_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_double(const int64_t *agg, const int64_t *count, const double null_val)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t *agg, const int64_t)
RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_float(int32_t *agg, const float val)
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_decimal(const int64_t *sum, const int64_t *count, const double null_val, const uint32_t scale)
RUNTIME_EXPORT bool check_interrupt_init(unsigned command)
RUNTIME_EXPORT ALWAYS_INLINE int32_t get_error_code(int32_t *error_codes)
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum(int64_t *agg, const int64_t val)
#define DEF_SHARED_AGG_STUBS(base_agg_func)
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
__device__ int32_t checked_single_agg_id_shared(int64_t *agg, const int64_t val, const int64_t null_val)
#define DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG(agg_type, null_type)
#define DEF_AGG_ID_INT(n)
RUNTIME_EXPORT ALWAYS_INLINE void record_error_code(const int32_t err_code, int32_t *error_codes)
__device__ void agg_sum_if_float_skip_val_shared(int32_t *agg, const float val, const float skip_val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool sample_ratio(const double proportion, const int64_t row_offset)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
RUNTIME_EXPORT ALWAYS_INLINE double percent_window_func(const int64_t output_buff, const int64_t pos)
RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_or(const int8_t lhs, const int8_t rhs, const int8_t null_val)
__device__ void force_sync()
#define DEF_SKIP_IF_AGG(skip_agg_func, base_agg_func)
#define DEF_RANGE_MODE_FRAME_LOWER_BOUND(target_val_type, col_type, null_type, opname, opsym)
RUNTIME_EXPORT void agg_min_int32(int32_t *agg, const int32_t val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_if_double(int64_t *agg, const double val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t *agg, const float val, const float skip_val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char *str, const int32_t str_len, const int32_t int_null)
__device__ int32_t agg_sum_if_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val, const int8_t cond)
RUNTIME_EXPORT ALWAYS_INLINE SumAndCountPair< int64_t > * get_integer_derived_aggregation_tree(int64_t **aggregation_trees, size_t partition_idx)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float(uint32_t *agg, const float val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t width_bucket_reversed_no_oob_check(const double target_value, const double lower_bound, const double scale_factor)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t width_bucket_expr(const double target_value, const bool reversed, const double lower_bound, const double upper_bound, const int32_t partition_count)
__device__ int32_t group_buff_idx_impl()